In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import os

# Configuración de visualización
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# =============================================================================
# 1. CARGAR LOS DATOS
# =============================================================================

directorio_actual = os.getcwd()
print(directorio_actual)
    # df = pd.read_csv("datos\\datos_carr_sel_prepro.csv", encoding='latin1', sep=";")
    # return df
# Cargar archivos
topic_freq = pd.read_csv("datos\\topic_freq.csv", encoding='cp1252')
topic_docs = pd.read_csv("datos\\topic_docs.csv", encoding='cp1252')


C:\Users\accar\Notebooks\ProyectoTesisMCD\app


FileNotFoundError: [Errno 2] No such file or directory: 'datos\\topic_freq.csv'

In [1]:

print("="*80)
print("ANÁLISIS DE RESULTADOS DEL MODELADO DE TÓPICOS")
print("="*80)

# Mostrar primeras filas para entender estructura
print("\n1. ESTRUCTURA DE LOS DATOS")
print("-"*80)
print("\nTopic_freq (primeras 5 filas):")
print(topic_freq.head())
print(f"\nColumnas: {list(topic_freq.columns)}")
print(f"Dimensiones: {topic_freq.shape}")

print("\nTopic_docs (primeras 5 filas):")
print(topic_docs.head())
print(f"\nColumnas: {list(topic_docs.columns)}")
print(f"Dimensiones: {topic_docs.shape}")

# =============================================================================
# 2. ANÁLISIS DE FRECUENCIAS DE TÓPICOS
# =============================================================================

print("\n" + "="*80)
print("2. ANÁLISIS DE FRECUENCIAS DE TÓPICOS")
print("="*80)

# Excluir outliers (topic -1)
topic_freq_clean = topic_freq[topic_freq['Topic'] != -1].copy()
outliers = topic_freq[topic_freq['Topic'] == -1]

# Estadísticas generales
total_docs = topic_freq['Count'].sum()
total_topics = len(topic_freq_clean)
docs_outliers = outliers['Count'].values[0] if len(outliers) > 0 else 0

print(f"\nTotal de documentos: {total_docs}")
print(f"Total de tópicos identificados: {total_topics}")
print(f"Documentos outliers: {docs_outliers} ({(docs_outliers/total_docs)*100:.1f}%)")
print(f"Documentos clasificados: {total_docs - docs_outliers} ({((total_docs - docs_outliers)/total_docs)*100:.1f}%)")

# Calcular porcentajes
topic_freq_clean['Percentage'] = (topic_freq_clean['Count'] / total_docs) * 100

# Ordenar por frecuencia
topic_freq_clean = topic_freq_clean.sort_values('Count', ascending=False)

print("\n" + "-"*80)
print("DISTRIBUCIÓN DE DOCUMENTOS POR TÓPICO")
print("-"*80)
print(topic_freq_clean[['Topic', 'Name', 'Count', 'Percentage']].to_string(index=False))

# =============================================================================
# 3. ANÁLISIS DE PALABRAS CLAVE POR TÓPICO
# =============================================================================

print("\n" + "="*80)
print("3. PALABRAS CLAVE POR TÓPICO (c-TF-IDF)")
print("="*80)

# Asumiendo que las columnas de palabras clave están en topic_freq
# Identificar columnas con palabras clave (típicamente: Representation o keywords)
keyword_columns = [col for col in topic_freq.columns if 'representation' in col.lower() 
                   or 'keyword' in col.lower() or col.startswith('Word_')]

if keyword_columns:
    print(f"\nColumnas de palabras clave detectadas: {keyword_columns}")
    
    for idx, row in topic_freq_clean.iterrows():
        topic_id = row['Topic']
        topic_name = row['Name']
        count = row['Count']
        
        print(f"\n{'='*80}")
        print(f"TÓPICO {topic_id}: {topic_name}")
        print(f"Documentos: {count} ({row['Percentage']:.1f}%)")
        print(f"{'='*80}")
        
        # Extraer palabras clave
        for col in keyword_columns:
            if pd.notna(row[col]):
                print(f"  {col}: {row[col]}")

# =============================================================================
# 4. ANÁLISIS DE DOCUMENTOS ASIGNADOS A CADA TÓPICO
# =============================================================================

print("\n" + "="*80)
print("4. ANÁLISIS DE DOCUMENTOS POR TÓPICO")
print("="*80)

# Verificar columna de topic en topic_docs
topic_col = [col for col in topic_docs.columns if 'topic' in col.lower()][0]

# Distribución de documentos
topic_distribution = topic_docs[topic_col].value_counts().sort_index()

print(f"\nDistribución de documentos:")
print(topic_distribution)

# Estadísticas por tópico
print("\n" + "-"*80)
print("ESTADÍSTICAS DETALLADAS POR TÓPICO")
print("-"*80)

for topic_id in sorted(topic_distribution.index):
    if topic_id == -1:
        continue
    
    docs_topic = topic_docs[topic_docs[topic_col] == topic_id]
    
    print(f"\nTópico {topic_id}:")
    print(f"  - Total documentos: {len(docs_topic)}")
    
    # Si hay columnas de fecha/año
    if 'año' in topic_docs.columns or 'year' in topic_docs.columns:
        year_col = 'año' if 'año' in topic_docs.columns else 'year'
        year_dist = docs_topic[year_col].value_counts().sort_index()
        print(f"  - Distribución por año:")
        for year, count in year_dist.items():
            print(f"      {year}: {count} docs")
    
    # Si hay columnas de impacto
    if 'vistas' in topic_docs.columns:
        print(f"  - Vistas promedio: {docs_topic['vistas'].mean():.0f}")
        print(f"  - Vistas máximas: {docs_topic['vistas'].max():.0f}")
    
    if 'descargas' in topic_docs.columns:
        print(f"  - Descargas promedio: {docs_topic['descargas'].mean():.0f}")
        print(f"  - Descargas máximas: {docs_topic['descargas'].max():.0f}")

# =============================================================================
# 5. DOCUMENTOS MÁS REPRESENTATIVOS POR TÓPICO
# =============================================================================

print("\n" + "="*80)
print("5. DOCUMENTOS MÁS REPRESENTATIVOS POR TÓPICO")
print("="*80)

# Si existe columna de probabilidad o distancia
prob_col = [col for col in topic_docs.columns if 'prob' in col.lower() or 'distance' in col.lower()]

for topic_id in sorted(topic_distribution.index):
    if topic_id == -1:
        continue
    
    docs_topic = topic_docs[topic_docs[topic_col] == topic_id].copy()
    
    # Ordenar por probabilidad (si existe)
    if prob_col:
        docs_topic = docs_topic.sort_values(prob_col[0], ascending=False)
    elif 'vistas' in topic_docs.columns:
        docs_topic = docs_topic.sort_values('vistas', ascending=False)
    
    print(f"\n{'='*80}")
    print(f"TÓPICO {topic_id} - Top 5 documentos más representativos")
    print(f"{'='*80}")
    
    # Mostrar top 5
    for idx, (i, row) in enumerate(docs_topic.head(5).iterrows(), 1):
        print(f"\n{idx}. {row.get('titulo', row.get('title', 'Sin título'))}")
        if 'año' in row:
            print(f"   Año: {row['año']}")
        if 'vistas' in row:
            print(f"   Vistas: {row['vistas']}, Descargas: {row.get('descargas', 'N/A')}")
        if prob_col and prob_col[0] in row:
            print(f"   Probabilidad: {row[prob_col[0]]:.3f}")

# =============================================================================
# 6. ANÁLISIS TEMPORAL (SI HAY DATOS DE AÑO)
# =============================================================================

if 'año' in topic_docs.columns or 'year' in topic_docs.columns:
    print("\n" + "="*80)
    print("6. ANÁLISIS TEMPORAL DE TÓPICOS")
    print("="*80)
    
    year_col = 'año' if 'año' in topic_docs.columns else 'year'
    
    # Crear tabla de contingencia
    temporal = pd.crosstab(
        topic_docs[year_col], 
        topic_docs[topic_col],
        margins=True
    )
    
    print("\nTabla de contingencia (Año x Tópico):")
    print(temporal)
    
    # Calcular porcentajes por año
    temporal_pct = pd.crosstab(
        topic_docs[year_col], 
        topic_docs[topic_col],
        normalize='index'
    ) * 100
    
    print("\nDistribución porcentual por año:")
    print(temporal_pct.round(1))
    
    # Identificar tendencias
    print("\n" + "-"*80)
    print("TENDENCIAS TEMPORALES")
    print("-"*80)
    
    for topic_id in sorted(topic_distribution.index):
        if topic_id == -1:
            continue
        
        topic_by_year = temporal_pct[topic_id].dropna()
        
        if len(topic_by_year) > 3:
            # Calcular tendencia simple
            years = topic_by_year.index.astype(float)
            values = topic_by_year.values
            
            # Comparar períodos
            first_period = values[:len(values)//2].mean()
            second_period = values[len(values)//2:].mean()
            
            if second_period > first_period * 1.3:
                trend = "↑ CRECIENTE"
            elif second_period < first_period * 0.7:
                trend = "↓ DECRECIENTE"
            else:
                trend = "→ ESTABLE"
            
            print(f"\nTópico {topic_id}: {trend}")
            print(f"  Primera mitad: {first_period:.1f}%")
            print(f"  Segunda mitad: {second_period:.1f}%")
            print(f"  Cambio: {((second_period/first_period - 1)*100):.1f}%")

# =============================================================================
# 7. ANÁLISIS DE IMPACTO (VISTAS Y DESCARGAS)
# =============================================================================

if 'vistas' in topic_docs.columns:
    print("\n" + "="*80)
    print("7. ANÁLISIS DE IMPACTO POR TÓPICO")
    print("="*80)
    
    impact_stats = topic_docs.groupby(topic_col).agg({
        'vistas': ['mean', 'median', 'max', 'sum'],
        'descargas': ['mean', 'median', 'max', 'sum'] if 'descargas' in topic_docs.columns else []
    }).round(0)
    
    print("\nEstadísticas de impacto por tópico:")
    print(impact_stats)
    
    # Calcular ratio descarga/vista
    if 'descargas' in topic_docs.columns:
        impact_ratio = topic_docs.groupby(topic_col).apply(
            lambda x: x['descargas'].sum() / x['vistas'].sum()
        )
        
        print("\n" + "-"*80)
        print("RATIO DESCARGAS/VISTAS POR TÓPICO")
        print("-"*80)
        for topic_id, ratio in impact_ratio.items():
            if topic_id != -1:
                print(f"Tópico {topic_id}: {ratio:.2f}")
    
    # Identificar documentos de alto impacto por tópico
    print("\n" + "-"*80)
    print("DOCUMENTOS DE ALTO IMPACTO (>400 VISTAS) POR TÓPICO")
    print("-"*80)
    
    high_impact = topic_docs[topic_docs['vistas'] > 400]
    impact_by_topic = high_impact.groupby(topic_col).size()
    
    for topic_id, count in impact_by_topic.items():
        if topic_id != -1:
            pct = (count / len(topic_docs[topic_docs[topic_col] == topic_id])) * 100
            print(f"Tópico {topic_id}: {count} docs ({pct:.1f}%)")

# =============================================================================
# 8. MÉTRICAS DE CALIDAD DEL MODELADO
# =============================================================================

print("\n" + "="*80)
print("8. MÉTRICAS DE CALIDAD DEL MODELADO")
print("="*80)

# Índice de Herfindahl-Hirschman (concentración)
topic_proportions = topic_freq_clean['Count'] / topic_freq_clean['Count'].sum()
hhi = (topic_proportions ** 2).sum()

print(f"\nÍndice de Herfindahl-Hirschman (HHI): {hhi:.3f}")
if hhi > 0.25:
    print("  → Alta concentración: pocos tópicos dominan")
elif hhi > 0.15:
    print("  → Concentración moderada")
else:
    print("  → Baja concentración: distribución equilibrada")

# Entropía (diversidad)
entropy = -(topic_proportions * np.log(topic_proportions)).sum()
max_entropy = np.log(len(topic_proportions))
normalized_entropy = entropy / max_entropy

print(f"\nEntropía normalizada: {normalized_entropy:.3f}")
print(f"  → Diversidad temática: {normalized_entropy*100:.1f}%")

# Cobertura
coverage = ((total_docs - docs_outliers) / total_docs) * 100
print(f"\nCobertura del modelo: {coverage:.1f}%")
print(f"  → {total_docs - docs_outliers} de {total_docs} documentos clasificados")

# =============================================================================
# 9. GUARDAR RESULTADOS PARA EL CAPÍTULO 4
# =============================================================================

print("\n" + "="*80)
print("9. GUARDANDO RESULTADOS")
print("="*80)

# Crear resumen para tablas del capítulo
summary = topic_freq_clean[['Topic', 'Name', 'Count', 'Percentage']].copy()
summary.columns = ['Tópico', 'Etiqueta', 'N° docs', '% corpus']
summary.to_csv('resumen_topicos.csv', index=False, encoding='utf-8-sig')
print("✓ resumen_topicos.csv guardado")

# Crear tabla de impacto por tópico
if 'vistas' in topic_docs.columns:
    impact_table = topic_docs.groupby(topic_col).agg({
        'vistas': 'mean',
        'descargas': 'mean' if 'descargas' in topic_docs.columns else lambda x: 0
    }).round(0)
    impact_table.to_csv('impacto_por_topico.csv', encoding='utf-8-sig')
    print("✓ impacto_por_topico.csv guardado")

# Crear tabla temporal
if 'año' in topic_docs.columns or 'year' in topic_docs.columns:
    temporal_pct.to_csv('evolucion_temporal_topicos.csv', encoding='utf-8-sig')
    print("✓ evolucion_temporal_topicos.csv guardado")

# Top documentos por tópico
for topic_id in sorted(topic_distribution.index):
    if topic_id == -1:
        continue
    
    docs_topic = topic_docs[topic_docs[topic_col] == topic_id].copy()
    
    if prob_col:
        docs_topic = docs_topic.sort_values(prob_col[0], ascending=False)
    elif 'vistas' in topic_docs.columns:
        docs_topic = docs_topic.sort_values('vistas', ascending=False)
    
    top5 = docs_topic.head(5)[['titulo', 'año', 'vistas', 'descargas'] if 'vistas' in docs_topic.columns 
                                else ['titulo', 'año']]
    
    top5.to_csv(f'top5_topico_{topic_id}.csv', index=False, encoding='utf-8-sig')

print(f"✓ Top 5 documentos por cada tópico guardados")

print("\n" + "="*80)
print("✓ ANÁLISIS COMPLETADO")
print("="*80)

FileNotFoundError: [Errno 2] No such file or directory: 'datos\\topic_freq.csv'