# Análisis de Programas de Gobierno 2025 con SpaCy

Este notebook contiene un análisis completo de los programas de gobierno de diferentes partidos políticos para las elecciones de 2025 en Bolivia, utilizando técnicas de procesamiento de lenguaje natural con SpaCy.

## Objetivos del análisis:
1. **Análisis de frecuencias**: Palabras y entidades más comunes
2. **Análisis de sentimientos**: Usando diccionarios y modelos transformer
3. **Topic modeling**: Identificación de temas principales
4. **Comparación entre partidos**: Similitudes y diferencias

## Partidos analizados:
- ADN, AP, FP, Libre, MAS, MORENA, NGP, PDC, SUMATE, UNIDAD

In [None]:
# Verificar que todo está instalado correctamente
import spacy
import transformers
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import pandas as pd
import numpy as np

print("✅ Todas las librerías están disponibles")

# Check that the Spanish spaCy model is installed.
try:
    nlp = spacy.load('es_core_news_md')
except OSError:
    # Report the failure (and how to fix it) instead of claiming readiness:
    # without the model, `nlp` is undefined and later cells cannot run.
    print("❌ Error: Modelo de SpaCy no encontrado")
    print("   Instálalo con: python -m spacy download es_core_news_md")
else:
    # Only announce success when the model actually loaded.
    print("✅ Modelo de SpaCy es_core_news_md cargado")
    print("✅ Todo listo para el análisis")

SyntaxError: unmatched ')' (3887913243.py, line 31)

In [None]:
# Importar librerías
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter, defaultdict
from wordcloud import WordCloud
import os
import re
from pathlib import Path

# Para análisis de sentimientos
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Para topic modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from gensim import corpora, models
from gensim.models import LdaModel

# Configuration
# Plot styling: the matplotlib style sheet is applied first, then the seaborn
# colour palette is set on top of it.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
# pandas display options: show every column and up to 100 rows when printing.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

print("✅ Librerías importadas correctamente")

In [None]:
# Load the spaCy model
nlp = spacy.load('es_core_news_md')

# Set up the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Load the transformer sentiment model (used here on Spanish text); the same
# checkpoint name is passed for both the model and its tokenizer
sentiment_pipeline = pipeline(
    "sentiment-analysis", 
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    tokenizer="nlptown/bert-base-multilingual-uncased-sentiment"
)

print("✅ Modelos cargados correctamente")
# Report which spaCy model/version was loaded, read from its metadata
print(f"SpaCy model: {nlp.meta['name']} - {nlp.meta['version']}")

In [None]:
# Load and parse the government program documents
def load_government_programs(data_path=None):
    """
    Load every government program (``*.txt``) found in a data directory.

    A file may begin with a metadata header made of ``key: value`` lines and
    closed by a line containing only ``---`` (a ``---`` on the very first line
    is treated as the header opener and skipped). Recognised keys are
    ``partido``, ``candidato_presidente`` and ``candidato_vicepresidente``.
    Everything after the closing ``---`` is the program text; when no closing
    marker is found, the whole file is kept as program text.

    Args:
        data_path: Directory holding the ``.txt`` files (``str`` or ``Path``).
            When omitted, the project's original ``data/2025`` folder is used.

    Returns:
        A ``(programs, metadata)`` tuple of dicts keyed by party name (the
        file name without its extension). ``programs`` maps to the stripped
        program text, ``metadata`` to the header fields that were found.
    """
    if data_path is None:
        data_path = '/Users/alexojeda/dev/social-data-science/data/2025'
    data_path = Path(data_path)

    # Header line prefix -> key stored in the metadata dict.
    header_fields = {
        'partido:': 'partido_completo',
        'candidato_presidente:': 'candidato_presidente',
        'candidato_vicepresidente:': 'candidato_vicepresidente',
    }

    programs = {}
    metadata = {}

    # glob() yields files in arbitrary filesystem order; sorting keeps the
    # party order (and therefore every downstream table and plot) reproducible.
    for file_path in sorted(data_path.glob('*.txt')):
        party_name = file_path.stem
        lines = file_path.read_text(encoding='utf-8').split('\n')

        party_info = {}
        content_start = 0

        for i, line in enumerate(lines):
            # A '---' after the first line closes the header.
            if i > 0 and line.strip() == '---':
                content_start = i + 1
                break
            for prefix, field in header_fields.items():
                if line.startswith(prefix):
                    party_info[field] = line.split(':', 1)[1].strip()
                    break

        # Program body without the metadata header.
        programs[party_name] = '\n'.join(lines[content_start:]).strip()
        metadata[party_name] = party_info

    return programs, metadata

# Load the data and list which parties were found
programs, metadata = load_government_programs()

print(f"✅ Cargados {len(programs)} programas de gobierno")
print("\nPartidos disponibles:")
for party in metadata:
    info = metadata[party]
    # Missing header fields are shown as 'N/A'
    print(f"- {party}: {info.get('partido_completo', 'N/A')}")
    print(f"  Candidato: {info.get('candidato_presidente', 'N/A')}")

In [None]:
# Text preprocessing with spaCy
def preprocess_text(text, remove_stopwords=True, lemmatize=True, pos_filter=None):
    """
    Run ``text`` through the module-level spaCy pipeline ``nlp`` and return cleaned tokens.

    Only alphabetic tokens longer than two characters are kept.

    Args:
        text: Text to process.
        remove_stopwords: Drop tokens spaCy flags as stop words.
        lemmatize: Emit the lemma instead of the surface form.
        pos_filter: Optional collection of POS tags to keep
            (e.g. ``['NOUN', 'ADJ', 'VERB']``); falsy means keep every tag.

    Returns:
        List of lower-cased token strings, in document order.
    """
    def is_wanted(token):
        # Reject non-alphabetic or very short tokens first.
        if not token.is_alpha or len(token.text) <= 2:
            return False
        if remove_stopwords and token.is_stop:
            return False
        if pos_filter and token.pos_ not in pos_filter:
            return False
        return True

    return [
        (token.lemma_ if lemmatize else token.text).lower()
        for token in nlp(text)
        if is_wanted(token)
    ]

# Named-entity extraction
def extract_entities(text):
    """
    Return the named entities spaCy finds in ``text``.

    Each entity is a dict with its surface ``text``, its ``label`` and the
    ``description`` that ``spacy.explain`` gives for that label.
    """
    return [
        {
            'text': ent.text,
            'label': ent.label_,
            'description': spacy.explain(ent.label_)
        }
        for ent in nlp(text).ents
    ]

print("✅ Funciones de preprocesamiento definidas")

## 1. Análisis de Frecuencias

Analicemos las palabras más frecuentes en los programas de gobierno y las entidades nombradas más mencionadas.

In [None]:
# Per-party frequency and entity analysis
frequency_analysis = {}
entity_analysis = {}
all_tokens = []
all_entities = []

for party, program in programs.items():
    # Content words only (nouns, adjectives, verbs) for the frequency counts
    tokens = preprocess_text(program, pos_filter=['NOUN', 'ADJ', 'VERB'])
    # Named entities found in the raw program text
    entities = extract_entities(program)

    frequency_analysis[party] = Counter(tokens)
    entity_analysis[party] = entities

    # Also accumulate everything for the corpus-wide statistics
    all_tokens += tokens
    all_entities += entities

    print(f"📊 {party}: {len(tokens)} tokens procesados, {len(entities)} entidades encontradas")

# Corpus-wide frequencies
global_frequencies = Counter(all_tokens)
print(f"\n📈 Total: {len(all_tokens)} tokens, {len(set(all_tokens))} únicos")

In [None]:
# Global frequency overview: a 2x2 matplotlib figure
fig, axes = plt.subplots(2, 2, figsize=(20, 15))

# Top 20 most frequent words
top_words = global_frequencies.most_common(20)
# NOTE(review): this unpacking raises ValueError when global_frequencies is empty
words, counts = zip(*top_words)

axes[0,0].barh(range(len(words)), counts)
axes[0,0].set_yticks(range(len(words)))
axes[0,0].set_yticklabels(words)
axes[0,0].set_title('Top 20 Palabras Más Frecuentes (Global)', fontsize=14, fontweight='bold')
axes[0,0].set_xlabel('Frecuencia')
# Put the most frequent word at the top of the horizontal bar chart
axes[0,0].invert_yaxis()

# Global word cloud built from the same frequency counts
wordcloud = WordCloud(width=800, height=400, background_color='white', 
                     max_words=100, colormap='viridis').generate_from_frequencies(global_frequencies)
axes[0,1].imshow(wordcloud, interpolation='bilinear')
axes[0,1].axis('off')
axes[0,1].set_title('WordCloud - Todas las Palabras', fontsize=14, fontweight='bold')

# Program length per party; each program is preprocessed again here without
# a POS filter, so these counts differ from the frequency-analysis token counts
program_lengths = [len(preprocess_text(program)) for program in programs.values()]
parties = list(programs.keys())

axes[1,0].bar(parties, program_lengths, color=sns.color_palette("husl", len(parties)))
axes[1,0].set_title('Longitud de Programas por Partido', fontsize=14, fontweight='bold')
axes[1,0].set_ylabel('Número de tokens')
axes[1,0].tick_params(axis='x', rotation=45)

# Most common named entities (case-insensitive, ignoring texts of 2 characters or fewer)
entity_counter = Counter([ent['text'].lower() for ent in all_entities if len(ent['text']) > 2])
top_entities = entity_counter.most_common(15)
# Only draw the panel when at least one entity was found
if top_entities:
    ent_names, ent_counts = zip(*top_entities)
    axes[1,1].barh(range(len(ent_names)), ent_counts)
    axes[1,1].set_yticks(range(len(ent_names)))
    axes[1,1].set_yticklabels(ent_names)
    axes[1,1].set_title('Top 15 Entidades Más Mencionadas', fontsize=14, fontweight='bold')
    axes[1,1].set_xlabel('Frecuencia')
    axes[1,1].invert_yaxis()

plt.tight_layout()
plt.show()

In [None]:
# Top words per party, one panel per party on a 2x5 grid
fig, axes = plt.subplots(2, 5, figsize=(25, 12))
axes = axes.flatten()

# zip() stops at the shorter input, so parties beyond the ten available
# panels are simply not drawn
for ax, (party, freq_counter) in zip(axes, frequency_analysis.items()):
    top_10 = freq_counter.most_common(10)
    if not top_10:
        # Nothing to plot for a party without tokens; leave its panel blank
        continue

    words, counts = zip(*top_10)
    positions = range(len(words))
    ax.barh(positions, counts)
    ax.set_yticks(positions)
    ax.set_yticklabels(words, fontsize=8)
    ax.set_title(f'{party}\n({sum(freq_counter.values())} tokens)',
                 fontsize=10, fontweight='bold')
    # Most frequent word on top
    ax.invert_yaxis()
    ax.tick_params(axis='x', labelsize=8)

plt.tight_layout()
plt.suptitle('Top 10 Palabras por Partido Político', fontsize=16, fontweight='bold', y=1.02)
plt.show()

## 2. Análisis de Sentimientos

Analizaremos los sentimientos de los programas usando múltiples enfoques:
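1. **VADER**: Diccionario especializado en sentimientos (léxico construido para inglés; sobre texto en español sus puntuaciones son solo orientativas)
2. **TextBlob**: Análisis de polaridad y subjetividad (el analizador por defecto también está basado en inglés)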
3. **BERT Multilingual**: Modelo transformer pre-entrenado

In [None]:
# Comprehensive sentiment analysis
def analyze_sentiment_comprehensive(text, max_length=512):
    """
    Score ``text`` with VADER, TextBlob and the BERT sentiment pipeline.

    The text is cut into consecutive chunks of ``max_length * 4`` characters.
    Only the first three chunks are considered, and chunks with 10 or fewer
    non-blank characters are skipped. Each remaining chunk is scored by the
    module-level ``analyzer`` (VADER), by ``TextBlob`` and by
    ``sentiment_pipeline`` (fed the first ``max_length`` characters of the
    chunk); the per-chunk scores are then averaged.

    NOTE(review): VADER and TextBlob's default analyzer are built for English;
    confirm their scores are meaningful for Spanish text before drawing
    conclusions from them.

    Args:
        text: Document to analyse.
        max_length: Size limit used for chunking and for truncating the BERT input.

    Returns:
        Dict with ``vader_compound``, ``vader_positive``, ``vader_neutral``,
        ``vader_negative``, ``textblob_polarity``, ``textblob_subjectivity``
        (each the mean over the scored chunks, 0 when nothing was scored),
        ``bert_label`` (most frequent label, ``'NEUTRAL'`` when nothing was
        scored) and ``bert_score`` (mean score, 0.5 when nothing was scored).
    """
    chunk_chars = max_length * 4
    # Slicing the range keeps just the first three chunk offsets, so the rest
    # of a long document is never materialised.
    chunks = [
        text[start:start + chunk_chars]
        for start in range(0, len(text), chunk_chars)[:3]
    ]

    results = {
        'vader': {'compound': [], 'pos': [], 'neu': [], 'neg': []},
        'textblob': {'polarity': [], 'subjectivity': []},
        'bert': {'label': [], 'score': []}
    }

    for chunk in chunks:
        if len(chunk.strip()) <= 10:
            continue

        # VADER
        vader_scores = analyzer.polarity_scores(chunk)
        for key in results['vader']:
            results['vader'][key].append(vader_scores[key])

        # TextBlob
        sentiment = TextBlob(chunk).sentiment
        results['textblob']['polarity'].append(sentiment.polarity)
        results['textblob']['subjectivity'].append(sentiment.subjectivity)

        # BERT: best effort, so a failing chunk falls back to a neutral
        # placeholder instead of aborting the whole document.
        # NOTE(review): 'NEUTRAL' is presumably not a label the model itself
        # emits; confirm how downstream plots should treat it.
        try:
            bert_result = sentiment_pipeline(chunk[:max_length])[0]
            results['bert']['label'].append(bert_result['label'])
            results['bert']['score'].append(bert_result['score'])
        except Exception as e:
            print(f"Error en BERT: {e}")
            results['bert']['label'].append('NEUTRAL')
            results['bert']['score'].append(0.5)

    def _mean(values, default=0):
        # Average of the collected scores, or the default when none exist.
        return np.mean(values) if values else default

    labels = results['bert']['label']
    # Counter breaks ties by first occurrence, so the majority label is
    # reproducible (iterating a set of strings is not).
    majority_label = Counter(labels).most_common(1)[0][0] if labels else 'NEUTRAL'

    final_results = {
        'vader_compound': _mean(results['vader']['compound']),
        'vader_positive': _mean(results['vader']['pos']),
        'vader_neutral': _mean(results['vader']['neu']),
        'vader_negative': _mean(results['vader']['neg']),
        'textblob_polarity': _mean(results['textblob']['polarity']),
        'textblob_subjectivity': _mean(results['textblob']['subjectivity']),
        'bert_label': majority_label,
        'bert_score': _mean(results['bert']['score'], default=0.5)
    }

    return final_results

print("✅ Función de análisis de sentimientos definida")

In [None]:
# Run the sentiment analysis for every party
sentiment_results = {}

print("🔍 Analizando sentimientos...")
for party, program in programs.items():
    print(f"  Procesando {party}...")
    sentiment_results[party] = analyze_sentiment_comprehensive(program)

# Build the results table with one row per party.
# from_dict(orient='index') infers a dtype per column. Transposing
# pd.DataFrame(sentiment_results) instead leaves every column as object
# (bert_label is a string), which turns round() into a no-op and makes numeric
# reductions such as idxmax() fail in later cells.
sentiment_df = pd.DataFrame.from_dict(sentiment_results, orient='index')
sentiment_df['partido'] = sentiment_df.index
sentiment_df = sentiment_df.reset_index(drop=True)

print("\n📊 Resultados de análisis de sentimientos:")
print(sentiment_df.round(3))

In [None]:
# Sentiment analysis dashboard: a 2x2 grid of Plotly subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'VADER - Sentimiento Compuesto',
        'TextBlob - Polaridad vs Subjetividad',
        'VADER - Distribución de Sentimientos',
        'BERT - Clasificación de Sentimientos'
    ),
    specs=[[{"type": "bar"}, {"type": "scatter"}],
           [{"type": "bar"}, {"type": "bar"}]]
)

# (1,1) VADER compound score per party
fig.add_trace(
    go.Bar(
        x=sentiment_df['partido'],
        y=sentiment_df['vader_compound'],
        name='VADER Compound',
        marker_color='lightblue'
    ),
    row=1, col=1
)

# (1,2) TextBlob polarity vs subjectivity, one labelled marker per party
fig.add_trace(
    go.Scatter(
        x=sentiment_df['textblob_polarity'],
        y=sentiment_df['textblob_subjectivity'],
        mode='markers+text',
        text=sentiment_df['partido'],
        textposition='top center',
        marker=dict(size=12, color='red'),
        name='TextBlob'
    ),
    row=1, col=2
)

# (2,1) VADER positive / neutral / negative shares as a stacked bar
vader_components = (
    ('vader_positive', 'Positivo', 'green'),
    ('vader_neutral', 'Neutral', 'gray'),
    ('vader_negative', 'Negativo', 'red'),
)
for column, label, color in vader_components:
    fig.add_trace(
        go.Bar(
            x=sentiment_df['partido'],
            y=sentiment_df[column],
            name=label,
            marker_color=color
        ),
        row=2, col=1
    )

# (2,2) Number of parties per BERT label
bert_counts = sentiment_df['bert_label'].value_counts()
fig.add_trace(
    go.Bar(
        x=bert_counts.index,
        y=bert_counts.values,
        name='BERT Classification',
        marker_color='purple'
    ),
    row=2, col=2
)

fig.update_layout(
    height=800,
    title_text="Análisis de Sentimientos - Programas de Gobierno 2025",
    title_x=0.5,
    showlegend=True,
    # Without this the three VADER traces are drawn side by side rather than
    # stacked. The other bar subplots hold a single trace each, so they are
    # unaffected.
    barmode='stack'
)

fig.update_xaxes(tickangle=45, row=1, col=1)
fig.update_xaxes(title_text="Polaridad", row=1, col=2)
fig.update_yaxes(title_text="Subjetividad", row=1, col=2)
fig.update_xaxes(tickangle=45, row=2, col=1)

fig.show()

## 3. Topic Modeling

Utilizaremos Latent Dirichlet Allocation (LDA) para identificar los temas principales en los programas de gobierno.

In [None]:
# Prepare the data for topic modeling
def prepare_documents_for_lda(programs):
    """
    Turn each party program into a filtered token list suitable for LDA.

    Programs are preprocessed keeping only nouns and adjectives; tokens of
    three characters or fewer and a fixed list of generic political terms are
    then removed.

    Args:
        programs: Mapping of party name to program text.

    Returns:
        ``(documents, doc_names)``: the token list of every program and the
        matching party names, in the mapping's iteration order.
    """
    # Terms so common in political programs that they are excluded from the topics
    political_stopwords = {
        'bolivia', 'boliviano', 'boliviana', 'país', 'estado', 'gobierno',
        'nacional', 'público', 'social', 'económico', 'política', 'político',
        'pueblo', 'ciudadano', 'sociedad', 'desarrollo', 'gestión', 'proceso'
    }

    documents = []
    doc_names = []

    for party, program in programs.items():
        candidate_tokens = preprocess_text(program, pos_filter=['NOUN', 'ADJ'])
        documents.append([
            word for word in candidate_tokens
            if len(word) > 3 and word not in political_stopwords
        ])
        doc_names.append(party)

    return documents, doc_names

# Prepare the documents
documents, doc_names = prepare_documents_for_lda(programs)

# Build the Gensim dictionary and corpus
dictionary = corpora.Dictionary(documents)

# Drop extremes: words found in fewer than 2 documents or in more than 50% of them
dictionary.filter_extremes(no_below=2, no_above=0.5)

corpus = [dictionary.doc2bow(doc) for doc in documents]

print("📚 Corpus preparado:")
print(f"  - {len(documents)} documentos")
print(f"  - {len(dictionary)} palabras únicas")
# Each bag-of-words entry is a (token_id, count) pair, so the total number of
# tokens is the sum of the counts; len(doc) would only count the distinct
# words of each document.
print(f"  - {sum(count for doc in corpus for _, count in doc)} tokens totales")

In [None]:
# Train the LDA model
num_topics = 6  # Number of topics to identify

print("🤖 Entrenando modelo LDA...")
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=num_topics,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

print(f"✅ Modelo LDA entrenado con {num_topics} temas")

# Collect, for every topic, its ten top words with their probabilities and a
# short description built from the first five
topics = []
for idx in range(num_topics):
    topic_words = lda_model.show_topic(idx, topn=10)
    summary = ' + '.join(f"{word}({prob:.3f})" for word, prob in topic_words[:5])
    topics.append(dict(
        topic_id=idx,
        words=[word for word, _prob in topic_words],
        probabilities=[prob for _word, prob in topic_words],
        description=summary,
    ))

print("\n🏷️ Temas identificados:")
for i, topic in enumerate(topics):
    print(f"\nTema {i}: {topic['description']}")

In [None]:
# Assign a dominant topic to every document
document_topics = []
for party_name, bow in zip(doc_names, corpus):
    doc_topics = lda_model.get_document_topics(bow)
    # The dominant topic is the (topic_id, probability) pair with the highest probability
    best_id, best_prob = max(doc_topics, key=lambda pair: pair[1])

    document_topics.append({
        'partido': party_name,
        'dominant_topic_id': best_id,
        'dominant_topic_prob': best_prob,
        'all_topics': doc_topics
    })

# Results table, with the description of each dominant topic attached
topic_df = pd.DataFrame(document_topics)
topic_df['dominant_topic_desc'] = topic_df['dominant_topic_id'].apply(
    lambda topic_id: topics[topic_id]['description']
)

summary_columns = ['partido', 'dominant_topic_id', 'dominant_topic_prob', 'dominant_topic_desc']
print("📊 Asignación de temas por partido:")
print(topic_df[summary_columns].round(3))

In [None]:
# Topic modeling dashboard: a 2x2 grid of Plotly subplots
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=(
        'Distribución de Temas por Partido',
        'Probabilidad del Tema Dominante',
        'Top Palabras por Tema',
        'Matriz de Similitud de Temas'
    ),
    specs=[[{"type": "bar"}, {"type": "bar"}],
           [{"type": "bar"}, {"type": "heatmap"}]]
)

# (1,1) Number of parties whose dominant topic is each topic, ordered by topic id
topic_counts = topic_df['dominant_topic_id'].value_counts().sort_index()
fig.add_trace(
    go.Bar(
        x=[f"Tema {i}" for i in topic_counts.index],
        y=topic_counts.values,
        name='Distribución de Temas',
        marker_color='lightgreen'
    ),
    row=1, col=1
)

# (1,2) Probability of the dominant topic per party, labelled with the topic id
fig.add_trace(
    go.Bar(
        x=topic_df['partido'],
        y=topic_df['dominant_topic_prob'],
        name='Probabilidad Tema Dominante',
        marker_color='orange',
        text=topic_df['dominant_topic_id'],
        textposition='outside'
    ),
    row=1, col=2
)

# (2,1) Top words of the most common dominant topic.
# topic_counts is sorted by topic id, so its first index entry is the lowest
# id, not the most frequent topic; idxmax() picks the id with the highest
# count (lowest id on ties).
most_common_topic = topic_counts.idxmax()
top_words = topics[most_common_topic]['words'][:10]
top_probs = topics[most_common_topic]['probabilities'][:10]

fig.add_trace(
    go.Bar(
        x=top_probs,
        y=top_words,
        orientation='h',
        name=f'Tema {most_common_topic}',
        marker_color='purple'
    ),
    row=2, col=1
)

# (2,2) Topic similarity: Jaccard index between the top-10 word sets of each topic pair
topic_word_sets = [set(topic['words'][:10]) for topic in topics]
similarity_matrix = np.zeros((num_topics, num_topics))
for i in range(num_topics):
    for j in range(num_topics):
        shared = topic_word_sets[i] & topic_word_sets[j]
        combined = topic_word_sets[i] | topic_word_sets[j]
        similarity_matrix[i, j] = len(shared) / len(combined)

fig.add_trace(
    go.Heatmap(
        z=similarity_matrix,
        x=[f"Tema {i}" for i in range(num_topics)],
        y=[f"Tema {i}" for i in range(num_topics)],
        colorscale='Viridis',
        name='Similitud'
    ),
    row=2, col=2
)

fig.update_layout(
    height=800,
    title_text="Topic Modeling - Programas de Gobierno 2025",
    title_x=0.5,
    showlegend=False
)

fig.update_xaxes(tickangle=45, row=1, col=2)
fig.update_yaxes(title_text="Palabras", row=2, col=1)
fig.update_xaxes(title_text="Probabilidad", row=2, col=1)

fig.show()

## 4. Análisis Comparativo y Conclusiones

Realizaremos un análisis comparativo final que combine todos los aspectos estudiados.

In [None]:
# Integrated comparison: one row per party combining sentiment, topic and text statistics
comparative_df = sentiment_df.merge(
    topic_df[['partido', 'dominant_topic_id', 'dominant_topic_prob']],
    on='partido'
)

# Text statistics per party
text_stats = []
for party, program in programs.items():
    tokens = preprocess_text(program)
    entities = extract_entities(program)

    n_tokens = len(tokens)
    n_unique = len(set(tokens))
    # Sentences are approximated by the number of periods (at least 1, to avoid dividing by zero)
    n_sentences = max(program.count('.'), 1)

    text_stats.append({
        'partido': party,
        'total_tokens': n_tokens,
        'unique_tokens': n_unique,
        'lexical_diversity': n_unique / n_tokens if n_tokens else 0,
        'total_entities': len(entities),
        'avg_sentence_length': n_tokens / n_sentences
    })

text_stats_df = pd.DataFrame(text_stats)
comparative_df = comparative_df.merge(text_stats_df, on='partido')

print("📊 Análisis comparativo integral:")
print(comparative_df.round(3))

In [None]:
# Comparative analysis figure: a 2x3 matplotlib grid
fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# 1-2. Annotated scatter plots coloured by dominant topic:
#      sentiment vs lexical diversity, and program length vs entity count
scatter_panels = (
    (axes[0,0], 'lexical_diversity', 'vader_compound',
     'Diversidad Léxica', 'Sentimiento (VADER)', 'Sentimiento vs Diversidad Léxica'),
    (axes[0,1], 'total_tokens', 'total_entities',
     'Total de Tokens', 'Total de Entidades', 'Longitud vs Entidades Nombradas'),
)
for ax, x_col, y_col, x_label, y_label, title in scatter_panels:
    ax.scatter(comparative_df[x_col], comparative_df[y_col],
               s=100, alpha=0.7, c=comparative_df['dominant_topic_id'], cmap='tab10')
    # Label every point with its party name
    for party, x_val, y_val in zip(comparative_df['partido'],
                                   comparative_df[x_col],
                                   comparative_df[y_col]):
        ax.annotate(party, (x_val, y_val),
                    xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

# 3. TextBlob polarity vs subjectivity, one series per dominant topic
for topic_id in comparative_df['dominant_topic_id'].unique():
    topic_data = comparative_df[comparative_df['dominant_topic_id'] == topic_id]
    axes[0,2].scatter(topic_data['textblob_polarity'], topic_data['textblob_subjectivity'],
                     label=f'Tema {topic_id}', s=100, alpha=0.7)
axes[0,2].set_xlabel('Polaridad (TextBlob)')
axes[0,2].set_ylabel('Subjetividad (TextBlob)')
axes[0,2].set_title('Sentimientos por Tema Dominante')
axes[0,2].legend()
axes[0,2].grid(True, alpha=0.3)

# 4. Radar chart of features for a selection of parties
selected_parties = ['MAS', 'MORENA', 'ADN', 'UNIDAD']
features = ['vader_compound', 'textblob_polarity', 'lexical_diversity', 'dominant_topic_prob']

# One angle per feature, repeating the first to close the polygon
angles = np.linspace(0, 2*np.pi, len(features), endpoint=False)
angles = np.concatenate((angles, [angles[0]]))

# plt.subplots() creates Cartesian axes, which have no set_theta_* methods.
# Replace the panel with a polar axes first, then configure its orientation.
axes[1,0].remove()
axes[1,0] = fig.add_subplot(2, 3, 4, projection='polar')
axes[1,0].set_theta_offset(np.pi / 2)
axes[1,0].set_theta_direction(-1)

for party in selected_parties:
    if party in comparative_df['partido'].values:
        party_data = comparative_df[comparative_df['partido'] == party]
        values = []
        for feature in features:
            val = party_data[feature].iloc[0]
            # Rescale the [-1, 1] sentiment scores to [0, 1]; the other
            # features are plotted as they are
            if feature in ['vader_compound', 'textblob_polarity']:
                val = (val + 1) / 2
            values.append(val)
        values += [values[0]]  # Close the polygon

        axes[1,0].plot(angles, values, 'o-', linewidth=2, label=party)
        axes[1,0].fill(angles, values, alpha=0.25)

axes[1,0].set_xticks(angles[:-1])
axes[1,0].set_xticklabels(features)
axes[1,0].set_title('Perfil de Características\n(Partidos Seleccionados)')
axes[1,0].legend(loc='upper right', bbox_to_anchor=(1.3, 1.0))

# 5. Correlation heatmap
corr_features = ['vader_compound', 'textblob_polarity', 'textblob_subjectivity',
                'lexical_diversity', 'total_tokens', 'total_entities', 'dominant_topic_prob']
corr_matrix = comparative_df[corr_features].corr()

im = axes[1,1].imshow(corr_matrix, cmap='RdBu_r', vmin=-1, vmax=1)
axes[1,1].set_xticks(range(len(corr_features)))
axes[1,1].set_yticks(range(len(corr_features)))
axes[1,1].set_xticklabels([f.replace('_', '\n') for f in corr_features], rotation=45, ha='right')
axes[1,1].set_yticklabels([f.replace('_', '\n') for f in corr_features])
axes[1,1].set_title('Correlaciones entre Variables')

# Write the correlation value inside every cell
for i in range(len(corr_features)):
    for j in range(len(corr_features)):
        axes[1,1].text(j, i, f'{corr_matrix.iloc[i, j]:.2f}',
                       ha="center", va="center", color="black", fontsize=8)

plt.colorbar(im, ax=axes[1,1], shrink=0.8)

# 6. Overall ranking
ranking_features = ['vader_compound', 'lexical_diversity', 'total_entities', 'dominant_topic_prob']
ranking_df = comparative_df.copy()

# Rank each feature (1 = highest value), then average the ranks per party
for feature in ranking_features:
    ranking_df[f'{feature}_rank'] = ranking_df[feature].rank(ascending=False)

ranking_df['overall_rank'] = ranking_df[[f'{f}_rank' for f in ranking_features]].mean(axis=1)
ranking_df = ranking_df.sort_values('overall_rank')

axes[1,2].barh(range(len(ranking_df)), ranking_df['overall_rank'],
              color=plt.cm.RdYlGn_r(ranking_df['overall_rank']/ranking_df['overall_rank'].max()))
axes[1,2].set_yticks(range(len(ranking_df)))
axes[1,2].set_yticklabels(ranking_df['partido'])
axes[1,2].set_xlabel('Ranking Promedio')
axes[1,2].set_title('Ranking General de Partidos')
axes[1,2].invert_yaxis()

plt.tight_layout()
plt.show()

## 5. Resumen y Conclusiones del Análisis

A continuación se presenta un resumen de los hallazgos principales del análisis de los programas de gobierno 2025:

In [None]:
# Print an automatic summary of the findings
print("🎯 RESUMEN DEL ANÁLISIS DE PROGRAMAS DE GOBIERNO 2025")
print("="*60)

# 1. Frequency analysis
print("\n📊 ANÁLISIS DE FRECUENCIAS:")
top_5_global = global_frequencies.most_common(5)
print(f"• Palabras más frecuentes globalmente: {', '.join([word for word, _ in top_5_global])}")

# Preprocess every program once and reuse the lengths for both extremes;
# each preprocess_text call is a full spaCy pass over the document.
tokens_per_party = {party: len(preprocess_text(program)) for party, program in programs.items()}
longest_program = max(tokens_per_party.items(), key=lambda x: x[1])
shortest_program = min(tokens_per_party.items(), key=lambda x: x[1])
print(f"• Programa más extenso: {longest_program[0]} ({longest_program[1]} tokens)")
print(f"• Programa más conciso: {shortest_program[0]} ({shortest_program[1]} tokens)")

# 2. Sentiment analysis
print("\n😊 ANÁLISIS DE SENTIMIENTOS:")
most_positive = comparative_df.loc[comparative_df['vader_compound'].idxmax()]
most_negative = comparative_df.loc[comparative_df['vader_compound'].idxmin()]
print(f"• Programa más positivo (VADER): {most_positive['partido']} ({most_positive['vader_compound']:.3f})")
print(f"• Programa más negativo (VADER): {most_negative['partido']} ({most_negative['vader_compound']:.3f})")

most_subjective = comparative_df.loc[comparative_df['textblob_subjectivity'].idxmax()]
least_subjective = comparative_df.loc[comparative_df['textblob_subjectivity'].idxmin()]
print(f"• Programa más subjetivo: {most_subjective['partido']} ({most_subjective['textblob_subjectivity']:.3f})")
print(f"• Programa más objetivo: {least_subjective['partido']} ({least_subjective['textblob_subjectivity']:.3f})")

# 3. Topic modeling: top three words of each topic and the parties it dominates
print("\n🏷️ TOPIC MODELING:")
print(f"• Se identificaron {num_topics} temas principales")
for i, topic in enumerate(topics):
    topic_parties = topic_df[topic_df['dominant_topic_id'] == i]['partido'].tolist()
    print(f"• Tema {i}: {', '.join(topic['words'][:3])} → Partidos: {', '.join(topic_parties)}")

# 4. Distinctive characteristics
print("\n🔍 CARACTERÍSTICAS DISTINTIVAS:")
most_diverse = comparative_df.loc[comparative_df['lexical_diversity'].idxmax()]
least_diverse = comparative_df.loc[comparative_df['lexical_diversity'].idxmin()]
print(f"• Mayor diversidad léxica: {most_diverse['partido']} ({most_diverse['lexical_diversity']:.3f})")
print(f"• Menor diversidad léxica: {least_diverse['partido']} ({least_diverse['lexical_diversity']:.3f})")

most_entities = comparative_df.loc[comparative_df['total_entities'].idxmax()]
print(f"• Más entidades nombradas: {most_entities['partido']} ({most_entities['total_entities']} entidades)")

# 5. Final ranking: ranking_df is sorted by average rank, best first
print("\n🏆 RANKING GENERAL (basado en múltiples métricas):")
top_3_parties = ranking_df.head(3)
for i, (_, party_data) in enumerate(top_3_parties.iterrows(), 1):
    print(f"{i}. {party_data['partido']} (Rank: {party_data['overall_rank']:.2f})")

print("\n" + "="*60)
print("✅ Análisis completado exitosamente")

In [None]:
# Save the results to disk
import json
from datetime import datetime

# Create the results directory. parents=True also creates missing parent
# folders, so the cell does not fail when the project path is not there yet.
results_dir = Path('/Users/alexojeda/dev/social-data-science/resultados_analisis')
results_dir.mkdir(parents=True, exist_ok=True)

# Assemble the analysis results
results = {
    'fecha_analisis': datetime.now().isoformat(),
    'resumen': {
        'total_partidos': len(programs),
        # Reuse the per-party token counts already stored in text_stats_df
        # instead of running spaCy over every program again; int() turns the
        # NumPy integer into a JSON-serialisable Python int.
        'total_tokens': int(text_stats_df['total_tokens'].sum()),
        'palabras_mas_frecuentes': dict(global_frequencies.most_common(20)),
        'temas_identificados': {
            f'tema_{i}': {
                'palabras_principales': topic['words'][:10],
                'partidos_asociados': topic_df[topic_df['dominant_topic_id'] == i]['partido'].tolist()
            }
            for i, topic in enumerate(topics)
        }
    },
    'analisis_sentimientos': comparative_df[['partido', 'vader_compound', 'textblob_polarity', 'textblob_subjectivity']].to_dict('records'),
    'topic_modeling': topic_df[['partido', 'dominant_topic_id', 'dominant_topic_prob']].to_dict('records'),
    'estadisticas_texto': text_stats_df.to_dict('records')
}

# Write the JSON summary (ensure_ascii=False keeps accented characters readable)
with open(results_dir / 'analisis_programas_gobierno_2025.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=2)

# Write the DataFrames as CSV
comparative_df.to_csv(results_dir / 'analisis_comparativo.csv', index=False, encoding='utf-8')
sentiment_df.to_csv(results_dir / 'analisis_sentimientos.csv', index=False, encoding='utf-8')
topic_df.to_csv(results_dir / 'topic_modeling.csv', index=False, encoding='utf-8')

print(f"💾 Resultados guardados en: {results_dir}")
print("📁 Archivos generados:")
print("  • analisis_programas_gobierno_2025.json (resumen completo)")
print("  • analisis_comparativo.csv (datos comparativos)")
print("  • analisis_sentimientos.csv (resultados de sentimientos)")
print("  • topic_modeling.csv (resultados de temas)")