# **Análisis exploratorio de datos**

In [43]:
## Librerías a utilizar
import polars as pl
import spacy
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

In [55]:
## Ruta de la data cruda
ruta_gen = "/Users/arturohernandezlopez/Documents/git/demo_sentiment_analysis/"
ruta_data_raw = ruta_gen + "data-raw/sentimientos_simulados_comentarios.csv"
data_raw = pl.read_csv(ruta_data_raw)
data_raw.head()

empresa,producto,sentimiento,comentario,fecha_hora
str,str,str,str,str
"""Apple""","""MacBook Air""","""neutral""","""MacBook Air cumple con lo bási…","""2025-04-16 13:00:00"""
"""Apple""","""MacBook Air""","""negativo""","""He tenido más problemas que be…","""2025-04-19 11:00:00"""
"""Nvidia""","""GeForce NOW""","""negativo""","""Malísimo el GeForce NOW de Nvi…","""2025-04-16 23:00:00"""
"""Nike""","""Air Max""","""neutral""","""Vi muchas reseñas del Air Max …","""2025-04-19 08:00:00"""
"""Apple""","""MacBook Air""","""positivo""","""No puedo creer lo bien que fun…","""2025-04-16 17:00:00"""


In [45]:
data_raw.describe()

statistic,empresa,producto,sentimiento,comentario,fecha_hora
str,str,str,str,str,str
"""count""","""1200""","""1200""","""1200""","""1200""","""1200"""
"""null_count""","""0""","""0""","""0""","""0""","""0"""
"""mean""",,,,,
"""std""",,,,,
"""min""","""Apple""","""Air Max""","""negativo""","""Air Max cumple con lo básico. …","""2025-04-15 08:00:00"""
"""25%""",,,,,
"""50%""",,,,,
"""75%""",,,,,
"""max""","""Tesla""","""iPhone 15""","""positivo""","""¡Amo mi nuevo iPhone 15 de App…","""2025-04-20 08:00:00"""


In [46]:
def clean_text(text):
    if not text:
        return ""
    # Limpieza general
    text = re.sub(r"http\S+", "", text)  # eliminar URLs
    text = re.sub(r"@\w+", "", text)     # eliminar menciones
    text = re.sub(r"#", "", text)        # eliminar hashtags
    text = re.sub(r"[^A-Za-z0-9\s]", "", text)  # eliminar puntuación
    text = text.lower()
    return text.strip()

def normalize_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

def analyze_sentiment(text):
    if not text.strip():
        return ("neutral", 0.0)
    score = analyzer.polarity_scores(text)["compound"]
    if score >= 0.05:
        return ("positivo", score)
    elif score <= -0.05:
        return ("negativo", score)
    else:
        return ("neutral", score)



In [None]:
# Limpieza de texto
data = data_raw.with_columns([
    pl.col("comentario").map_elements(clean_text).alias("clean_comentario")
])

In [None]:
# Normalización con spaCy
data = data.with_columns([
    pl.col("clean_comentario").map_elements(normalize_text).alias("texto_limpio")
])

In [None]:
# Análisis de sentimientos
data = data.with_columns([
    pl.col("texto_limpio").map_elements(lambda t: analyze_sentiment(t)[0]).alias("sentimiento"),
    pl.col("texto_limpio").map_elements(lambda t: analyze_sentiment(t)[1]).alias("score_sentimiento")
])

In [52]:
# Convertir timestamp y redondear a la hora
data = data.with_columns([
    pl.col("fecha_hora").str.strptime(pl.Datetime, "%Y-%m-%d %H:%M:%S").alias("hora")
])

data = data.with_columns([
    pl.col("hora").dt.truncate("1h").alias("hora_redondeada")
])

In [53]:
data.head()

empresa,producto,sentimiento,comentario,fecha_hora,clean_comentario,texto_limpio,score_sentimiento,hora,hora_redondeada
str,str,str,str,str,str,str,f64,datetime[μs],datetime[μs]
"""Apple""","""MacBook Air""","""neutral""","""MacBook Air cumple con lo bási…","""2025-04-16 13:00:00""","""macbook air cumple con lo bsic…","""macbook air cumple con lo bsic…",0.0,2025-04-16 13:00:00,2025-04-16 13:00:00
"""Apple""","""MacBook Air""","""neutral""","""He tenido más problemas que be…","""2025-04-19 11:00:00""","""he tenido ms problemas que ben…","""tenido ms problemas que benefi…",0.0,2025-04-19 11:00:00,2025-04-19 11:00:00
"""Nvidia""","""GeForce NOW""","""neutral""","""Malísimo el GeForce NOW de Nvi…","""2025-04-16 23:00:00""","""malsimo el geforce now de nvid…","""malsimo el geforce de nvidia l…",0.0,2025-04-16 23:00:00,2025-04-16 23:00:00
"""Nike""","""Air Max""","""neutral""","""Vi muchas reseñas del Air Max …","""2025-04-19 08:00:00""","""vi muchas reseas del air max d…","""vi muchas reseas del air max d…",0.0,2025-04-19 08:00:00,2025-04-19 08:00:00
"""Apple""","""MacBook Air""","""neutral""","""No puedo creer lo bien que fun…","""2025-04-16 17:00:00""","""no puedo creer lo bien que fun…","""puedo creer lo bien que funcio…",0.0,2025-04-16 17:00:00,2025-04-16 17:00:00


In [56]:
## Exportar base limpia
ruta_exp = ruta_gen + "data/data_comentarios_clean.csv"
data.write_csv(ruta_exp)

## **Agregaciones clave que alimentarán los gráficos**

In [58]:
# 1. Total de menciones por empresa
menciones_empresa = data.group_by("empresa").agg([
    pl.count().alias("total_menciones"),
    pl.mean("score_sentimiento").alias("sentimiento_promedio")
]).sort("total_menciones", descending=True)


menciones_empresa

  pl.count().alias("total_menciones"),


empresa,total_menciones,sentimiento_promedio
str,u32,f64
"""Nike""",192,-0.006013
"""Binance""",179,0.092068
"""Coinbase""",178,-0.014808
"""Apple""",174,-0.019975
"""Nvidia""",173,-0.028218
"""Meta""",166,-0.019355
"""Tesla""",138,-0.011841


In [59]:
# 2. Menciones por hora
menciones_hora = data.group_by("hora_redondeada").agg([
    pl.count().alias("total_menciones"),
    pl.mean("score_sentimiento").alias("sentimiento_promedio")
])

menciones_hora

  pl.count().alias("total_menciones"),


hora_redondeada,total_menciones,sentimiento_promedio
datetime[μs],u32,f64
2025-04-17 19:00:00,11,0.0
2025-04-19 19:00:00,12,-0.006433
2025-04-17 06:00:00,8,0.0
2025-04-15 10:00:00,20,-0.035105
2025-04-18 05:00:00,12,0.028333
…,…,…
2025-04-16 05:00:00,15,0.01752
2025-04-17 14:00:00,10,0.034
2025-04-20 07:00:00,7,0.0
2025-04-16 19:00:00,13,0.0


In [60]:
# 3. Comparativo de productos
comparativo_producto = data.group_by("producto").agg([
    pl.count().alias("total_menciones"),
    pl.mean("score_sentimiento").alias("sentimiento_promedio")
])

comparativo_producto

  pl.count().alias("total_menciones"),


producto,total_menciones,sentimiento_promedio
str,u32,f64
"""MacBook Air""",39,0.008718
"""Autopilot""",34,-0.022921
"""Powerwall""",32,-0.017806
"""Model 3""",41,0.008293
"""Facebook Ads""",41,-0.056439
…,…,…
"""Nike Run Club""",51,-0.016794
"""Earn""",45,-0.022424
"""Apple Watch""",48,-0.0466
"""Nike App""",46,-0.004996


In [61]:
# 4. Sentimiento por empresa
sentimiento_empresa = data.group_by(["empresa", "sentimiento"]).agg([
    pl.count().alias("conteo")
])
sentimiento_empresa

  pl.count().alias("conteo")


empresa,sentimiento,conteo
str,str,u32
"""Nike""","""neutral""",173
"""Nvidia""","""positivo""",4
"""Nike""","""negativo""",13
"""Nvidia""","""negativo""",17
"""Tesla""","""neutral""",129
…,…,…
"""Apple""","""neutral""",155
"""Tesla""","""negativo""",6
"""Tesla""","""positivo""",3
"""Apple""","""negativo""",12


In [62]:
# 5. Top comentarios positivos y negativos
top_positivos = data.sort("score_sentimiento", descending=True).select(["empresa", "comentario", "score_sentimiento"]).head(10)
top_negativos = data.sort("score_sentimiento").select(["empresa", "comentario", "score_sentimiento"]).head(10)

In [68]:
## Rutas para exportar
ruta_men_emp = ruta_gen + "data/menciones_por_empresa.csv"
ruta_men_hor = ruta_gen + "data/sentimiento_por_hora.csv"
ruta_con_prod = ruta_gen + "data/conteo_sentimientos.csv"
ruta_sen_emp = ruta_gen + "data/sentimiento_empresa.csv"

## Exportar archivos
menciones_empresa.write_csv(ruta_men_emp) # 1
menciones_hora.write_csv(ruta_men_hor) # 2
comparativo_producto.write_csv(ruta_con_prod) # 3
sentimiento_empresa.write_csv(ruta_sen_emp) # 4 