# 1. Multilabel classifier

## Data and categories 

In [None]:
import pandas as pd

In [None]:
# Load the line-delimited JSON dataset into a DataFrame, then preview its first rows.
df = pd.read_json("datasets/es_sum_mini.json", orient="records", lines=True)
df.head()

In [None]:
# Candidate labels (in Spanish) passed to the zero-shot classifier.
categories = [
    'positivo',
    'negativo',
    'economía',
    'electricidad',
    'telecomunicaciones',
    'ecología',
    'política',
    'energía',
]

## Pretrained zero-shot

In [None]:
from transformers import pipeline

# Build the zero-shot classification pipeline used throughout this notebook.
# The alternative model "joeddav/xlm-roberta-large-xnli" is noted as too big.
classifier = pipeline(
    "zero-shot-classification",
    model="typeform/squeezebert-mnli",
)

In [None]:
# Try the classifier on one sample sentence against all candidate categories.
# `multi_label` is the current name of the option formerly spelled
# `multi_class` (deprecated): it lets several labels be true at once.
classifier(
    "A ERC y Crida per Sabadell (CUP), que hasta ahora..",
    candidate_labels=categories,
    multi_label=True,
)

## Log predictions in Rubric

In [1]:
from _rubric import rubric
from rubric.sdk.models import * 

In [None]:
import os

# Read the Rubric API key from the environment instead of committing a token
# to the notebook. Export RUBRIC_API_KEY before starting the kernel; the value
# is None when the variable is not set.
api_key = os.environ.get("RUBRIC_API_KEY")

In [None]:
# Hand the API key to the rubric client before any records are logged.
rubric.init(api_key)

In [None]:
# Zero-shot classify the first 100 summaries and log every prediction to Rubric.
for _, row in df[0:100].iterrows():
    text = row["summary"]
    # zero-shot prediction; `multi_label` is the current name of the pipeline
    # option formerly spelled `multi_class` (deprecated)
    preds = classifier(text, candidate_labels=categories, multi_label=True)
    item = TextClassificationRecord.from_dict({
        "inputs": {"text": text},
        "prediction": {
            "agent": "dvilasuero",
            # pair each returned label with its score
            "labels": [
                {"class": label, "confidence": score}
                for label, score in zip(preds["labels"], preds["scores"])
            ],
        },
        "multi_label": True,  # same setting as the pipeline's multi_label flag
        "event_timestamp": row["date"].isoformat(),
        "metadata": {"model": "typeform/squeezebert-mnli"},
    })
    # log one record each time
    rubric.log(records=[item], dataset="red_electrica_multilabel")

# 2. Entity classifier

## spaCy pretrained model

spaCy's pretrained Spanish model is not the best-performing NER model available.

In [None]:
import spacy

# Load the Spanish pipeline by its full package name. The 'es' shortcut link
# only exists in spaCy v2 and was removed in v3, whereas the package name
# loads under both (the package must be installed).
nlp = spacy.load('es_core_news_sm')

In [None]:
# Run the spaCy pipeline on a sample sentence and print, for each detected
# entity, its character start, character end and label.
doc = nlp(
    'Esto es una prueba sobre Mariano Rajoy, ex-presidente del PP, la loca de Pontevedra'
)
for ent in doc.ents:
    print(ent.start_char, ent.end_char, ent.label_)

## Log predictions in Rubric

In [None]:
# Run spaCy NER over the first 100 summaries and log each result to Rubric.
for _, row in df[0:100].iterrows():
    text = row['summary']
    doc = nlp(text)

    # One dict per entity: character offsets, token offsets and label.
    entities = []
    for ent in doc.ents:
        entities.append({
            'start': ent.start_char,
            'end': ent.end_char,
            'start_token': ent.start,
            'end_token': ent.end,
            'label': ent.label_,
        })

    payload = {
        "raw_text": text,
        "prediction": {
            "agent": "spacy_v2",
            "entities": entities,
        },
        "tokens": [token.text for token in doc],
        "event_timestamp": row['date'].isoformat(),
        "metadata": {'model': 'spacy_es_core_news_sm'},
    }
    record = TokenClassificationRecord.from_dict(payload)

    # log one record per call
    rubric.log([record], dataset="red_electrica_entities", task="ner")