# Preprocessing: modeling dataset as multilabel

In [None]:
import pandas as pd

In [None]:
!pip install biome-text

In [None]:
# Load the raw training CSV. The bare name on the last line makes the
# notebook render the frame as the cell output.
df = pd.read_csv('datasets/miso_training_ds.csv')
df

In [None]:
def preprocess(df):
    """Reshape the raw dataset into a multilabel frame.

    Each input row becomes one output row with the columns ``id``, ``text``
    and ``label``. ``label`` is a list: ``[misogyny_category, target]`` for
    rows whose ``label`` column equals 1, and an empty list for every other
    row.

    Args:
        df: DataFrame providing the columns ``id``, ``text``, ``label``,
            ``misogyny_category`` and ``target``.

    Returns:
        A new ``pandas.DataFrame`` with the columns ``id``, ``text`` and
        ``label`` (in that order), one row per input row.
    """
    records = [
        {
            'id': row.id,
            'text': row.text,
            'label': [row.misogyny_category, row.target] if row.label == 1 else [],
        }
        for _, row in df.iterrows()
    ]
    # Passing `columns` explicitly keeps the schema stable even when `df` has
    # no rows; without it an empty input would yield a frame with no columns.
    return pd.DataFrame(records, columns=['id', 'text', 'label'])

In [None]:
# Convert the raw training frame loaded above into the multilabel layout
# (columns: id, text, label-as-list) produced by `preprocess`.
train_df = preprocess(df) 

In [None]:
# Read the validation CSV and run it through the same `preprocess` step as
# the training data; the trailing bare name displays the result in the notebook.
validation_df = preprocess(
    pd.read_csv('datasets/validation_ds.csv')
)
validation_df

# Train baseline multilabel

In [None]:
from biome.text import *

In [None]:
# Wrap both preprocessed frames as biome.text `Dataset` objects, in the same
# order as the original assignments (training first, then validation).
train_ds, validation_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, validation_df)
)

In [None]:
# Label vocabulary for the multilabel head. These are the values expected in
# the `label` lists built by `preprocess` (category and target names) —
# NOTE(review): confirm against the CSV contents.
label_names = [
    'sexual_harassment',
    'dominance',
    'discredit',
    'stereotype',
    'derailing',
    'passive',
    'active',
]

# Text-classification head with multilabel output over `label_names`.
head_config = {
    "type": "TextClassification",
    "multilabel": True,
    "labels": label_names,
}

pipeline = Pipeline.from_config({"name": "multilabel", "head": head_config})

In [None]:
# Spot-check prediction on a sample sentence, run before the `pipeline.train`
# call further down; the returned value is shown as the cell output.
pipeline.predict(text="El mal querer by Rosalia")

In [None]:
# Train on `train_ds`, using `validation_ds` for validation. The `output`
# name is the same one later passed to `Pipeline.from_pretrained` —
# presumably the directory where the trained model is written; confirm
# against the biome.text documentation.
pipeline.train(training=train_ds, validation=validation_ds, output="baseline_myso_clas")

In [None]:
# Spot-check prediction on a sample sentence, run after the `pipeline.train`
# call above; the returned value is shown as the cell output.
pipeline.predict(text="Rosalia a fregar")

# Exploring training data in rubrix (new biome app + API)

This is only a prototype of what a Python wrapper could look like.

In [None]:
from _rubrix import rubrix
from rubrix.sdk.models import * 
import os
import warnings

# Credentials must not be committed with the notebook: the API key is read
# from the RUBRIX_API_KEY environment variable instead of being embedded here.
api_key = os.environ.get("RUBRIX_API_KEY", "")
if not api_key:
    # Warn rather than raise so the remaining cells can still be inspected;
    # calls that need authentication will fail until the variable is set.
    warnings.warn(
        "RUBRIX_API_KEY is not set; export it before initialising rubrix."
    )

In [None]:
# Initialise the prototype rubrix wrapper with the API key defined above.
rubrix.init(api_key)

In [None]:
# Build one rubrix record per training example.
#
# The loop must walk the preprocessed frame (`train_df`), whose `label` column
# holds a list of label names. The raw `df` carries the 0/1 flag that
# `preprocess` compares with `== 1`, so `len(...)` and iteration over it would
# fail.
records = []
for _, row in train_df.iterrows():
    record = TextClassificationRecord.from_dict({
        "id": row.id,
        "inputs": {"text": row.text},
        "multi_label": True
    })
    # Only examples that actually carry labels get an annotation attached.
    if len(row.label) > 0:
        record["annotation"] = {
            "agent": "dvilasuero",
            "labels": [{"class": label} for label in row.label],
        }
    records.append(record)

In [None]:
# Hand the records built above to the rubrix wrapper under the given dataset
# name (presumably uploads them to the rubrix server — confirm in `_rubrix`).
rubrix.log(records, dataset="es_multilabel_mysogyny_train")

# Exploring predictions over validation data in rubrix (new biome app + API)

In [None]:
# Rebind `df` to the raw validation CSV; the bare name on the last line makes
# the notebook render the frame as the cell output.
df = pd.read_csv('datasets/validation_ds.csv')
df

In [None]:
# Load a pipeline from 'baseline_myso_clas', the same name given as `output`
# to `pipeline.train` earlier in this notebook.
pipeline_classifier = Pipeline.from_pretrained('baseline_myso_clas')

In [None]:
# Build one rubrix record per validation example, carrying both the gold
# annotation (when the example has labels) and the model's prediction.
records = []
for _, row in validation_df.iterrows():
    payload = {
        "id": row.id,
        "inputs": {"text": row.text},
        "multi_label": True,
    }
    record = TextClassificationRecord.from_dict(payload)

    gold_labels = row.label
    if len(gold_labels) > 0:
        record["annotation"] = {
            "agent": "dvilasuero",
            "labels": [{"class": name} for name in gold_labels],
        }

    # Store the prediction next to the true labels: every predicted class is
    # paired with its probability, in the order returned by the pipeline.
    preds = pipeline_classifier.predict(text=row.text)
    scored = zip(preds['labels'], preds['probabilities'])
    record["prediction"] = {
        "agent": pipeline_classifier.name,
        "labels": [{"class": cls, "confidence": prob} for cls, prob in scored],
    }
    records.append(record)

In [None]:
# Hand the validation records (annotations plus predictions) to the rubrix
# wrapper under the given dataset name (presumably uploads them to the rubrix
# server — confirm in `_rubrix`).
rubrix.log(records, dataset="es_multilabel_mysogyny_val_with_predictions")

# Appendix: same with raw Python cli SDK

In [None]:
from rubrix.sdk.client import Client, AuthenticatedClient
from rubrix.sdk.models import * 
from rubrix.sdk.api.text_classification import bulk_records, search_records

In [None]:
import os
import warnings

# Credentials must not be committed with the notebook: the bearer token is
# read from the RUBRIX_API_KEY environment variable instead of being embedded.
rubrix_token = os.environ.get("RUBRIX_API_KEY", "")
if not rubrix_token:
    # Warn rather than raise so the cell still defines `client`; requests
    # that need authentication will fail until the variable is set.
    warnings.warn(
        "RUBRIX_API_KEY is not set; export it before calling the rubrix API."
    )

# An unauthenticated client is created first and its base URL is reused for
# the authenticated one, which replaces it under the same name.
client = Client(base_url="https://observe-dev.biome.recogn.ai")
client = AuthenticatedClient(
    base_url=client.base_url,
    token=rubrix_token,
    timeout=10
)

In [None]:
# Upload `records` in slices of `chunk_size`, one bulk request per slice, and
# print each response.
chunk_size = 1000
for start in range(0, len(records), chunk_size):
    batch = records[start:start + chunk_size]
    # NOTE(review): the description mentions a sentiment classifier with
    # title/body inputs, while the records built in this notebook carry a
    # single `text` input — looks copied from another project; confirm.
    bulk = TextClassificationRecordsBulk(
        name="test_miso",
        tags=TextClassificationRecordsBulkTags.from_dict({
            "type": "classifier",
            "lang": "spanish",
            "description": "Spanish sentiment classifier with `multifield inputs` (title and body)",
        }),
        records=batch,
    )
    response = bulk_records.sync_detailed(client=client, json_body=bulk)
    print(response)