# **Caso práctico 2: Análisis de noticias**

## 1. Análisis de sentimiento

In [None]:
from datasets import load_dataset

import rubrix as rb
from transformers import pipeline

# Spanish sentiment-analysis pipeline. `return_all_scores=True` asks for the
# score of every class, not just the top one; long inputs are truncated and
# batches are padded.
classifier = pipeline(
    "text-classification",
    model="finiteautomata/beto-sentiment-analysis",
    return_all_scores=True,
    truncation=True,
    padding=True,
)

# Wrap the pipeline with Rubrix monitoring, targeting the "noticias_en_es"
# dataset. `rb` must be imported in this cell: it is used here, long before the
# NER section imports it.
classifier = rb.monitor(classifier, dataset="noticias_en_es", sample_rate=1.0)

# First 500 examples of the Spanish MLSUM test split.
dataset = load_dataset("mlsum", "es", split="test[0:500]")

In [None]:
def _predict_sentiment(batch):
    """Run the sentiment classifier over the summaries of one batch."""
    return {"prediction": classifier(batch["summary"])}


# Process the dataset in batches of 32 summaries.
dataset.map(_predict_sentiment, batched=True, batch_size=32)

## 2. Categorización de texto (zero-shot)

In [None]:
# Zero-shot classifier: each candidate label is inserted into the hypothesis
# template below, so labels must be correctly spelled Spanish.
classifier = pipeline(
    "zero-shot-classification",
    model="Recognai/zeroshot_selectra_medium",
)

# Candidate news categories ("economía" carries its accent, like the others).
labels = [
    "política",
    "cultura",
    "sociedad",
    "economía",
    "deportes",
    "ciencia y tecnología",
]

# Hypothesis template; `{}` is replaced by each candidate label.
template = "Esta noticia habla de {}."

In [None]:
# Wrap the zero-shot pipeline with Rubrix monitoring (every call is sampled).
classifier = rb.monitor(
    classifier,
    dataset="noticias_en_es_categorias",
    sample_rate=1.0,
)

In [None]:
def _predict_category(batch):
    """Run the zero-shot classifier over the summaries of one batch."""
    scores = classifier(
        batch["summary"],
        candidate_labels=labels,
        hypothesis_template=template,
    )
    return {"prediction": scores}


# Process the dataset in batches of 8 summaries.
dataset.map(_predict_category, batched=True, batch_size=8)

## 3. Categorización de texto (etiquetado programático usando weak supervision)

![Labeling workflow](https://raw.githubusercontent.com/recognai/rubrix-materials/main/tutorials/weak_supervision/weak_supervision.svg "Labeling workflow")

In [None]:
import pandas as pd

# Load the lexicon: a ';'-separated file with no header row, whose two columns
# are named here as term and polarity.
crisol = pd.read_csv(
    "datos/crisol.csv",
    sep=";",
    names=["termino", "polaridad"],
)
crisol.head()

In [None]:
# Plot the distribution of polarity values in the lexicon.
crisol["polaridad"].hist()

In [None]:
# Shuffle the lexicon rows (sampling 100% of them without a fixed seed).
crisol = crisol.sample(frac=1.0)

In [None]:
from datasets import load_dataset

# First 10,000 examples of the Spanish MLSUM training split.
noticias = load_dataset("mlsum", "es", split="train[0:10000]")

# One Rubrix text-classification record per article, using only its summary.
records = [
    rb.TextClassificationRecord(inputs=articulo["summary"])
    for articulo in noticias
]
len(records)

In [None]:
# Drop the dataset first, then log the records under the same name.
rb.delete(name="noticias_en_es_weak_supervision")
rb.log(
    records=records,
    name="noticias_en_es_weak_supervision",
)

In [None]:
from rubrix.labeling.text_classification import Rule, WeakLabels

# Take up to 1000 positive and 2000 negative lexicon rows as
# (termino, polaridad) arrays.
terminos_positivos = (
    crisol.query("polaridad == 'positive'").head(1000).to_numpy()
)
terminos_negativos = (
    crisol.query("polaridad == 'negative'").head(2000).to_numpy()
)

In [None]:
# One rule per lexicon term: the term is the query and the rule votes for the
# polarity of the list it came from. Positive rules come first.
rules = [
    Rule(query=f"{termino}", label="Positivo")
    for termino, _ in terminos_positivos
] + [
    Rule(query=f"{termino}", label="Negativo")
    for termino, _ in terminos_negativos
]

In [None]:
# Apply all rules to the records of the weak-supervision dataset.
weak_labels = WeakLabels(
    rules=rules,
    dataset="noticias_en_es_weak_supervision",
)

In [None]:
# Display the summary of the applied rules (presumably per-rule statistics such
# as coverage -- see the Rubrix WeakLabels documentation).
weak_labels.summary()

In [None]:
from snorkel.labeling.model import LabelModel

# Fit Snorkel's label model on the weak-label matrix of the records that have
# no annotation.
label_model = LabelModel()
label_model.fit(
    L_train=weak_labels.matrix(has_annotation=False),
)

# Scoring is left disabled; it reads the annotated part of the matrix:
# label_model.score(L=weak_labels.matrix(has_annotation=True), Y=weak_labels.annotation())

In [None]:
# Get the part of the weak label matrix that has no corresponding annotation
train_matrix = weak_labels.matrix(has_annotation=False)

# Class probabilities and hard labels from the label model
predictions = label_model.predict_proba(L=train_matrix)
predicted_labels = label_model.predict(L=train_matrix)

# Map every probability column to its label name through the mapping built by
# WeakLabels instead of assuming a fixed ("Positivo", "Negativo") order: which
# integer each label receives is decided by WeakLabels, not by this notebook.
# NOTE(review): assumes every column of `predictions` has an entry in
# `label2int` (i.e. both labels were produced by at least one rule).
int2label = {
    idx: label
    for label, idx in weak_labels.label2int.items()
    if label is not None  # the None entry is the abstain marker, not a class
}
preds = [
    [(int2label[idx], prob) for idx, prob in enumerate(pred)]
    for pred in predictions
]

# Get the records that do not have an annotation
train_records = weak_labels.records(has_annotation=False)


# Add the predictions to the records
def add_prediction(record, prediction):
    """Attach `prediction` to `record` in place and return the same record."""
    record.prediction = prediction
    return record


abstain = weak_labels.label2int[None]
train_records_with_lm_prediction = [
    add_prediction(rec, pred)
    for rec, pred, label in zip(train_records, preds, predicted_labels)
    if label != abstain  # exclude records where the label model abstains
]

# Log a new dataset to Rubrix
rb.delete(name="noticias_en_es_snorkel")
rb.log(train_records_with_lm_prediction, name="noticias_en_es_snorkel")

## 4. Reconocimiento de entidades

In [None]:
import spacy
import rubrix as rb

from datasets import load_dataset

# Load the Spanish spaCy pipeline and wrap it with Rubrix monitoring, targeting
# the "noticias_en_es_ner" dataset.
nlp = rb.monitor(
    spacy.load("es_core_news_md"),
    dataset="noticias_en_es_ner",
    sample_rate=1.0,
)

# MLSUM Spanish test split from example 10000 onwards.
dataset = load_dataset("mlsum", "es", split="test[10000:]")

In [None]:
def extract_entities(record):
    """Run the spaCy pipeline over a batch of summaries.

    Args:
        record: A batch whose "summary" entry is an iterable of texts.

    Returns:
        A dict with key "processed" holding the text of each processed doc.
    """
    processed_texts = []
    for doc in nlp.pipe(record["summary"]):
        processed_texts.append(doc.text)
    return {"processed": processed_texts}

# Process the dataset in batches of 32 summaries.
dataset.map(extract_entities, batched=True, batch_size=32)

In [None]:
from rubrix.metrics.token_classification import *

In [None]:
# Entity-consistency metric for the NER dataset, limited to 100 mentions.
consistency_summary = entity_consistency(
    name="noticias_en_es_ner",
    mentions=100,
)
consistency_summary.visualize()

In [None]:
# Entity-label metric for the NER dataset.
labels_summary = entity_labels(name="noticias_en_es_ner")
labels_summary.visualize()