In [None]:
import datasets
from datasets import Dataset
from transformers import pipeline
import pandas as pd

In [None]:
# Instantiate one zero-shot-classification pipeline per checkpoint under comparison.
pl_small = pipeline(
    task="zero-shot-classification",
    model="Recognai/zeroshot_selectra_small",
)
pl_medium = pipeline(
    task="zero-shot-classification",
    model="Recognai/zeroshot_selectra_medium",
)
pl_beto = pipeline(
    task="zero-shot-classification",
    model="Recognai/bert-base-spanish-wwm-cased-xnli",
)

# MLSUM (Spanish, test split)

In [None]:
# Load the test split of the "es" configuration of the "mlsum" dataset.
mlsum = datasets.load_dataset(path="mlsum", name="es", split="test")

In [None]:
# Convert the loaded dataset to a pandas DataFrame for filtering and inspection.
mlsum_df = mlsum.to_pandas()

In [None]:
# Display the 20 most frequent topics (value_counts orders by descending count).
mlsum_df["topic"].value_counts().head(20)

In [None]:
# Boolean mask over mlsum_df: True for rows whose topic is one of the five kept sections.
idx = mlsum_df["topic"].isin(
    [
        "politica actualidad",
        "economia actualidad",
        "cultura actualidad",
        "sociedad actualidad",
        "deportes actualidad",
    ]
)

In [None]:
# Evaluation frame restricted to the masked rows:
# the article summary is the text to classify, the topic is its reference label.
data_df = pd.DataFrame(
    {
        "input": mlsum_df.loc[idx, "summary"],
        "label": mlsum_df.loc[idx, "topic"],
    }
)

In [None]:
# Keep only the first whitespace-separated word of each label,
# which drops the trailing "actualidad".
data_df["label"] = data_df["label"].apply(lambda topic: topic.split()[0])

In [None]:
# Rewrite the two unaccented labels with their accented spelling:
#   economia -> economía
#   politica -> política
# Any other label is passed through unchanged.
accent_fixes = {"economia": "economía", "politica": "política"}
data_df["label"] = data_df["label"].map(lambda topic: accent_fixes.get(topic, topic))

In [None]:
# Distinct labels present in data_df, ordered from most to least frequent.
candidate_labels = data_df["label"].value_counts().index.tolist()
candidate_labels

In [None]:
# Distinct labels (most frequent first) offered to every classifier as candidates.
candidate_labels = data_df["label"].value_counts().index.tolist()
# Passed to the pipelines as hypothesis_template; "{}" is the slot for a candidate label.
template = "Esta noticia es de {}"

def get_predictions(row):
    """Classify one example with each of the three zero-shot pipelines.

    Args:
        row: Mapping with an "input" entry holding the text to classify.

    Returns:
        Dict with the keys "small", "medium" and "beto", each holding the
        raw output of the corresponding pipeline for this row.
    """
    text = row["input"]
    # Every pipeline receives the same candidate labels and hypothesis template.
    shared_kwargs = {
        "candidate_labels": candidate_labels,
        "hypothesis_template": template,
    }
    classifiers = {"small": pl_small, "medium": pl_medium, "beto": pl_beto}
    return {
        name: classify(text, **shared_kwargs)
        for name, classify in classifiers.items()
    }

# Wrap the evaluation frame in a Dataset and run get_predictions on every row.
mlsum_dataset = Dataset.from_pandas(data_df)
mlsum_pred = mlsum_dataset.map(get_predictions)

In [None]:
# Write the dataset, including the prediction columns, to mlsum_predictions.json.
mlsum_pred.to_json("mlsum_predictions.json")

In [None]:
# Back to pandas for analysis. NOTE: this rebinds data_df, which from here on
# refers to the frame produced from mlsum_pred (inputs, labels and predictions).
data_df = mlsum_pred.to_pandas()

In [None]:
# For each model, count how often each label appears first in its "labels" output.
for model_name in ("small", "medium", "beto"):
    print("### predictions of", model_name)
    top_labels = data_df[model_name].map(lambda pred: pred["labels"][0])
    display(top_labels.value_counts())

In [None]:
# Accuracy per model: share of rows whose first predicted label equals the reference label.
for model_name in ("small", "medium", "beto"):
    print("### accuracy of", model_name)
    top_labels = data_df[model_name].map(lambda pred: pred["labels"][0])
    n_correct = (top_labels == data_df["label"]).sum()
    print(n_correct / len(data_df))