# Análisis de texto

In [None]:
%pip install pandas matplotlib scikit-learn spacy gdeltdoc pysentimiento unidecode wordcloud
%python -m spacy download es_core_news_md

## Descarga de titulares con GDELT (últimos 30 días)

In [None]:
import pandas as pd, datetime as dt
from gdeltdoc import Filters, GdeltDoc  # Cliente oficial de la comunidad
# Reference: https://github.com/alex9smith/gdelt-doc-api (installation and usage)

# Shared GDELT DOC API client used by consulta_pais for every query.
gd = GdeltDoc()

# Search terms per country: the key is the label stored with each result,
# the list holds the terms that make up that country's query.
PAISES = {
    "Bolivia": ["Bolivia", "La Paz", "Cochabamba", "Santa Cruz"],
    "Argentina": ["Argentina", "Buenos Aires", "Córdoba"],
    "Perú": ["Perú", "Lima", "Cusco"]
}

def consulta_pais(pais, terminos, dias=30, maxrecords=250):
    """Fetch recent GDELT articles matching any of ``terminos``, tagged with ``pais``.

    Args:
        pais: Label written to the ``pais_consulta`` column of the result.
        terminos: Search terms; an article matching any one of them qualifies.
        dias: How many days back to search.
        maxrecords: Maximum number of articles to request.

    Returns:
        The DataFrame returned by ``gd.article_search`` with an added
        ``pais_consulta`` column, or an empty DataFrame when nothing matched.
    """
    terminos = list(terminos)
    # gdeltdoc ORs the items of a list keyword, whereas a single string is
    # searched as one exact phrase, so the terms are passed as a list.
    # A lone term is passed as a plain string.
    keyword = terminos[0] if len(terminos) == 1 else terminos

    # ``Filters`` accepts ``timespan`` and ``num_records``; it has no
    # ``timelimit``, ``numrecords``, ``mode`` or ``translation`` parameters
    # (article-list mode is implied by calling ``article_search``).
    # NOTE(review): no language restriction is applied here — confirm whether
    # the installed gdeltdoc version offers a language filter if Spanish-only
    # results are required.
    f = Filters(
        keyword=keyword,
        timespan=f"{dias}d",
        num_records=maxrecords,
    )
    df = gd.article_search(f)
    if df is None or df.empty:
        return pd.DataFrame()
    df["pais_consulta"] = pais
    return df

# One query per country, using consulta_pais's default window and record cap.
dfs = [consulta_pais(pais, terminos) for pais, terminos in PAISES.items()]

# Stack only the non-empty results into a single frame with a fresh index.
raw = pd.concat(
    list(filter(lambda d: not d.empty, dfs)),
    ignore_index=True,
)
# An article returned for several countries is kept once, under the first
# country whose query returned it.
raw = raw.drop_duplicates(subset=["url"])
raw.head(3)


## Limpieza y pipeline de spaCy (lematización ES)

In [None]:
import spacy, re
from unidecode import unidecode
nlp = spacy.load("es_core_news_md")  # Spanish pipeline downloaded in the setup cell

# Patterns are raw strings with SINGLE backslashes: in a raw string, "\\S"
# means "a literal backslash followed by S", which never matches real text.
_URL_RE = re.compile(r"http\S+|www\.\S+")
_MENTION_RE = re.compile(r"[@#]\w+")
_SPACE_RE = re.compile(r"\s+")


def limpiar(t):
    """Strip URLs, @mentions and #hashtags from ``t`` and collapse whitespace.

    Args:
        t: Text to clean.

    Returns:
        The text with each URL, mention and hashtag replaced by a space,
        whitespace runs collapsed to one space, and both ends trimmed.
    """
    t = _URL_RE.sub(" ", t)
    t = _MENTION_RE.sub(" ", t)
    return _SPACE_RE.sub(" ", t).strip()

def normalizar_es(t):
    """Return ``t`` lowercased, cleaned with ``limpiar`` and transliterated to ASCII.

    The steps run in that order: lowercase, then ``limpiar``, then ``unidecode``.
    """
    # unidecode is optional: it drops accents to keep vocabularies simple.
    return unidecode(limpiar(t.lower()))

# Normalised headline text; missing titles are replaced by empty strings first.
raw["texto"] = raw["title"].fillna("").map(normalizar_es)

def lemas_es(texto):
    """Return the lemmas of the content tokens in ``texto``.

    The text is run through the module-level ``nlp`` pipeline. A token's lemma
    is kept only when the token is alphabetic, longer than two characters,
    and neither a stop word nor punctuation.
    """
    lemas = []
    for token in nlp(texto):
        if token.is_stop or token.is_punct:
            continue
        if not token.is_alpha or len(token) <= 2:
            continue
        lemas.append(token.lemma_)
    return lemas

# Lemmatise every normalised headline and preview the result.
# NOTE(review): "texto" is already lowercased and stripped of accents by
# normalizar_es, which may affect the spaCy model's stop-word and lemma
# lookups — confirm this is intended.
raw["lemmas"] = raw["texto"].map(lemas_es)
raw[["pais_consulta","texto","lemmas"]].head(3)


## Frecuencias y n-gramas (scikit-learn)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # :contentReference[oaicite:15]{index=15}
def top_tokens(df, n=15, ngram=(1,1)):
    """Return the ``n`` most frequent n-grams in ``df["texto"]``.

    Args:
        df: DataFrame with a ``texto`` column of strings.
        n: Number of rows to return.
        ngram: ``(min_n, max_n)`` range passed to ``CountVectorizer``.

    Returns:
        DataFrame with columns ``token`` and ``freq``, sorted by descending
        frequency. Terms appearing in fewer than two documents are excluded
        (``min_df=2``).
    """
    vectorizador = CountVectorizer(ngram_range=ngram, min_df=2)
    matriz = vectorizador.fit_transform(df["texto"].tolist())
    tabla = pd.DataFrame({
        "token": vectorizador.get_feature_names_out(),
        # Column sums give the total count per term; .A1 flattens to 1-D.
        "freq": matriz.sum(axis=0).A1,
    })
    return tabla.sort_values("freq", ascending=False).head(n)

# Top unigrams and bigrams per queried country, keyed by country label.
top_por_pais = {}
for p in raw["pais_consulta"].unique():
    top_por_pais[p] = top_tokens(
        raw.loc[raw["pais_consulta"] == p],
        n=15,
        ngram=(1,2),
    )
# Preview one country; this lookup assumes the "Bolivia" query returned rows.
top_por_pais["Bolivia"].head(10)


## Sentimiento (dos enfoques)

### Diccionario mínimo (reglas transparentes)

In [None]:
# Minimal sentiment lexicons. Entries are lowercase and unaccented so they
# match text that has been through normalizar_es.
LEX_POS = {"progreso","exito","crecer","bueno","positivo","mejora","ganar","feliz","beneficio","lider"}
LEX_NEG = {"crisis","corrupcion","malo","negativo","caida","violencia","perdida","protesta","denuncia","riesgo"}

def score_lex(texto):
    """Return a lexicon sentiment score for ``texto``.

    The score is the number of distinct positive lexicon words in the text
    minus the number of distinct negative ones; repeated words count once.

    Args:
        texto: Text to score.

    Returns:
        Integer score: positive, negative or zero.
    """
    import re  # local import so the function does not depend on an earlier cell

    # Extract word characters only, so adjacent punctuation ("exito:",
    # "crisis,") does not prevent a match as whitespace splitting did.
    toks = set(re.findall(r"\w+", texto.lower()))
    return len(toks & LEX_POS) - len(toks & LEX_NEG)

# Lexicon score per headline, then the mean score per queried country.
raw["sent_lex"] = raw["texto"].map(score_lex)
raw.groupby("pais_consulta")["sent_lex"].mean().round(3)


### Modelo ML para español con pysentimiento

In [None]:
from pysentimiento import create_analyzer  # :contentReference[oaicite:17]{index=17}
analyzer = create_analyzer(task="sentiment", lang="es")  # downloads the BETO/RoBERTuito model

def sent_label(texto):
    """Return the label the module-level ``analyzer`` predicts for ``texto``."""
    return analyzer.predict(texto).output  # "POS", "NEG", "NEU"

# Predict a sentiment label for each headline (one analyzer call per row).
raw["sent_ml"] = raw["texto"].map(sent_label)
# Count headlines per (country, label); combinations with no rows become 0.
sent_resumen = (raw.groupby(["pais_consulta","sent_ml"])
                   .size().unstack(fill_value=0))
sent_resumen


## Topic modelling (LDA con scikit-learn)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation  # classic LDA
# CountVectorizer is the vectorizer this cell actually uses; importing it here
# keeps the cell runnable without relying on an earlier cell's import.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# LDA is usually fitted on raw term counts rather than TF-IDF weights.
vec = CountVectorizer(min_df=3, max_df=0.9, ngram_range=(1,2))
X = vec.fit_transform(raw["texto"])
# Fixed random_state so repeated runs on the same data give the same topics.
lda = LatentDirichletAllocation(n_components=8, learning_method="batch", random_state=42)
lda.fit(X)

def mostrar_top_palabras(modelo, vocab, n_top=10):
    """Print the ``n_top`` highest-weighted vocabulary entries of each topic.

    Args:
        modelo: Fitted model exposing ``components_`` (one weight row per topic).
        vocab: Sequence mapping a column index to its term.
        n_top: Number of terms printed per topic.
    """
    for indice, pesos in enumerate(modelo.components_):
        orden = pesos.argsort()            # column indices, ascending by weight
        mejores = orden[-n_top:][::-1]     # last n_top, flipped to descending
        etiqueta = f"Tópico {indice:02d}: "
        print(etiqueta, ", ".join(vocab[j] for j in mejores))

# Print the ten highest-weighted terms of each fitted topic.
mostrar_top_palabras(lda, vec.get_feature_names_out(), n_top=10)
