In [None]:
from datasets import load_dataset

# Load "amazon_reviews_multi" with "es" as the second argument
# (presumably the Spanish configuration of the dataset - confirm).
dataset = load_dataset("amazon_reviews_multi", "es")

In [None]:
# Show the first item of the "validation" split to inspect the available fields.
dataset["validation"][0]

# Client setup

In [None]:
from rubric.sdk import Client, AuthenticatedClient
from rubric.sdk.models import * 
from rubric.sdk.api.text_classification import bulk_records, search_records

In [None]:
import os

# Connection settings are read from the environment so that no credential is
# stored in (or committed with) the notebook. The URL falls back to the local
# development server when RUBRIC_API_URL is not set.
api_url = os.environ.get("RUBRIC_API_URL", "http://127.0.0.1:8000")
api_token = os.environ.get("RUBRIC_API_TOKEN")
if not api_token:
    raise RuntimeError(
        "RUBRIC_API_TOKEN is not set: export a valid API token before running this notebook."
    )

# Authenticated client shared by every bulk upload in this notebook.
client = AuthenticatedClient(
    base_url=api_url,
    token=api_token,
    timeout=10,
)

In [None]:
# Echo the client object as the cell output to check how it was configured.
# NOTE(review): the repr may include the API token - clear this output before sharing.
client

# Store labeled dataset for initial exploration


Here we will store the validation dataset with labels, inputs and associated metadata.


In [None]:
def create_record(idx, inputs, label, metadata):
    """Build a labelled ``TextClassificationRecord`` without a prediction.

    Args:
        idx: Value stored under the record's ``"idx"`` key.
        inputs: Value stored under ``"inputs"`` (this notebook passes a dict
            of text fields).
        label: Class used for the single annotation label.
        metadata: Value stored under ``"metadata"``.

    Returns:
        Whatever ``TextClassificationRecord.from_dict`` returns for the
        assembled payload.
    """
    # The annotation is always attributed to the fixed "test" agent.
    annotation = {"agent": "test", "labels": [{"class": label}]}
    payload = {
        "idx": idx,
        "inputs": inputs,
        "annotation": annotation,
        "metadata": metadata,
    }
    return TextClassificationRecord.from_dict(payload)

In [None]:
# One labelled record per validation review: title and body are the inputs,
# the star rating is the annotation label.
# NOTE(review): "product_id" is used as the record idx; if idx has to be unique
# per record, confirm that no product is reviewed more than once in this split.
records = [
    create_record(
        idx=review["product_id"],
        inputs={
            "review_body": review['review_body'],
            "review_title": review['review_title'],
        },
        metadata={
            "product_category": review["product_category"],
            "reviewer_id": review["reviewer_id"],
        },
        label=review["stars"],
    )
    for review in dataset['validation']
]

In [None]:
# Preview the first five records that were built.
records[:5]

In [None]:
# Send the labelled records to the server in slices of `chunk_size` records.
chunk_size = 1000
dataset_name = "amazon_sentiment_es_validation_ds_multifield"
dataset_tags = {
    "type":"sentiment classifier",
    "lang": "spanish",
    "description": "Spanish sentiment classifier with `multifield inputs` (title and body)"
}

for start in range(0, len(records), chunk_size):
    bulk = TextClassificationRecordsBulk(
        name=dataset_name,
        # A fresh dict is handed to from_dict on every request.
        tags=TextClassificationRecordsBulkTags.from_dict(dict(dataset_tags)),
        records=records[start:start + chunk_size],
    )
    # `response` is rebound on each slice, so it ends up holding the last one.
    response = bulk_records.sync_detailed(client=client, json_body=bulk)
    print(response)

In [None]:
# Show the response of the last bulk request (the upload loop rebinds
# `response` on every chunk, so only the final one is left here).
response

# Store predictions and labels with a pretrained model for error analysis

Here we use a pretrained transformer from the Hugging Face Hub and analyse its quality on an unseen dataset.

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Hub checkpoint id; it is also reused below as the prediction "agent" and as
# the "model" tag of the uploaded datasets.
CHKPT = 'mrm8488/electricidad-small-finetuned-muchocine'
tokenizer = AutoTokenizer.from_pretrained(CHKPT)
model = AutoModelForSequenceClassification.from_pretrained(CHKPT)

# Later cells index the pipeline result with [0] and read 'label' / 'score'
# from every entry, so the return_all_scores=True output format is relied upon.
# NOTE(review): return_all_scores is reportedly deprecated in newer transformers
# releases in favour of top_k - confirm the output nesting before switching.
classifier = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

In [None]:
def get_stars(label):
    """Count the items of ``label`` that are not a single space character.

    Args:
        label: Iterable to scan; for a string this is its number of
            non-space characters.

    Returns:
        int: How many items differ from ``' '``.
    """
    return sum(1 for ch in label if ch != ' ')

In [None]:
# Try the classifier on one sentence and map every returned entry to the
# {"class", "confidence"} shape used for record predictions below.
[
    {"class": get_stars(scored['label']), "confidence": scored['score']}
    for scored in classifier("Una buena película, sin más.")[0]
]

In [None]:
def create_record_with_preds(idx, inputs, label, metadata, prediction):
    """Build a ``TextClassificationRecord`` holding both a label and a prediction.

    Args:
        idx: Value stored under the record's ``"idx"`` key.
        inputs: Value stored under ``"inputs"``.
        label: Class used for the single annotation label.
        metadata: Value stored under ``"metadata"``.
        prediction: Value stored under ``"prediction"`` (this notebook passes
            a dict with ``"agent"`` and ``"labels"`` keys).

    Returns:
        Whatever ``TextClassificationRecord.from_dict`` returns for the
        assembled payload.
    """
    # The annotation is always attributed to the fixed "test" agent.
    annotation = {"agent": "test", "labels": [{"class": label}]}
    payload = {
        "idx": idx,
        "inputs": inputs,
        "annotation": annotation,
        "prediction": prediction,
        "metadata": metadata,
    }
    return TextClassificationRecord.from_dict(payload)

In [None]:
# Rebuild `records`, this time attaching the model prediction next to the label.
# The classifier runs once per review, on the review body only.
records = [
    create_record_with_preds(
        idx=review["product_id"],
        inputs={
            "review_body": review['review_body'],
            "review_title": review['review_title'],
        },
        metadata={
            "product_category": review["product_category"],
            "reviewer_id": review["reviewer_id"],
        },
        label=review["stars"],
        prediction={
            "agent": CHKPT,
            "labels": [
                {"class": get_stars(scored['label']), "confidence": scored['score']}
                for scored in classifier(review['review_body'])[0]
            ],
        },
    )
    for review in dataset['validation']
]

In [None]:
# Number of records built (one per item iterated from dataset['validation']).
len(records)

In [None]:
# Send the records carrying labels and predictions in slices of `chunk_size`.
chunk_size = 200
dataset_name = "amazon_sentiment_es_validation_ds_multifield_with_predictions"
dataset_tags = {
    "env": "test",
    "model": CHKPT,
    "type": "evaluation dataset"
}

for start in range(0, len(records), chunk_size):
    bulk = TextClassificationRecordsBulk(
        name=dataset_name,
        records=records[start:start + chunk_size],
        # A fresh dict is handed to from_dict on every request.
        tags=TextClassificationRecordsBulkTags.from_dict(dict(dataset_tags)),
    )
    # `response` is rebound on each slice, so it ends up holding the last one.
    response = bulk_records.sync_detailed(client=client, json_body=bulk)
    print(response)

# Store only predictions for model monitoring

Here we use a pretrained transformer from the Hugging Face Hub to monitor the model's predictions over time.

In [None]:
def create_record_only_preds(idx, inputs, metadata, prediction):
    """Build a ``TextClassificationRecord`` with a prediction and no annotation.

    Args:
        idx: Value stored under the record's ``"idx"`` key.
        inputs: Value stored under ``"inputs"``.
        metadata: Value stored under ``"metadata"``.
        prediction: Value stored under ``"prediction"``.

    Returns:
        Whatever ``TextClassificationRecord.from_dict`` returns for the
        assembled payload.
    """
    payload = {
        "idx": idx,
        "inputs": inputs,
        "prediction": prediction,
        "metadata": metadata,
    }
    return TextClassificationRecord.from_dict(payload)

In [None]:
# Rebuild `records` with predictions only (no annotation).
# The classifier runs once per review, on the review body only.
records = [
    create_record_only_preds(
        idx=review["product_id"],
        inputs={
            "review_body": review['review_body'],
            "review_title": review['review_title'],
        },
        metadata={
            "product_category": review["product_category"],
            "reviewer_id": review["reviewer_id"],
        },
        prediction={
            "agent": CHKPT,
            "labels": [
                {"class": get_stars(scored['label']), "confidence": scored['score']}
                for scored in classifier(review['review_body'])[0]
            ],
        },
    )
    for review in dataset['validation']
]

In [None]:
# Send the prediction-only records in slices of `chunk_size`.
# NOTE(review): the tags match the "with_predictions" upload, including
# "type": "evaluation dataset" - confirm that is intended for this dataset.
chunk_size = 200
dataset_name = "amazon_sentiment_es_validation_ds_multifield_only_predictions"
dataset_tags = {
    "env": "test",
    "model": CHKPT,
    "type": "evaluation dataset"
}

for start in range(0, len(records), chunk_size):
    bulk = TextClassificationRecordsBulk(
        name=dataset_name,
        records=records[start:start + chunk_size],
        # A fresh dict is handed to from_dict on every request.
        tags=TextClassificationRecordsBulkTags.from_dict(dict(dataset_tags)),
    )
    # `response` is rebound on each slice, so it ends up holding the last one.
    response = bulk_records.sync_detailed(client=client, json_body=bulk)
    print(response)