# Dependencies

In [1]:
%load_ext autoreload
%autoreload 2
import nltk
import ipywidgets
import pandas as pd

import numpy as np
import time
from tqdm import tqdm
tqdm.pandas()
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)
import torch
import datetime
import os
import datasets_handler
import evaluation_metrics
import zeroberto
from zeroberto import ZeroBERTo
# import datasets
# from datasets import Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
import gc

# Dataset

In [23]:
# which_dataset = 'folhauol' 
# which_dataset = 'bbc-news'
which_dataset = 'ag_news'

hyp_template = "{}."
# hyp_template = "O tema principal deste texto é {}."
# hyp_template = "this text is about {}."
# hyp_template = "this article is about {}."

raw_data, data_col, class_col = datasets_handler.getDataset(which_dataset)
classes_list = list(raw_data[class_col].unique())
print(classes_list)

Found cached dataset ag_news (/Users/alealcoforado/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['business', 'science and technology', 'sports', 'world']


# ZeroBERTo

In [29]:
num_training_steps = 4

# max_inferences = len(train) #estava em 5000
max_inferences = 500

# zeroshot_method = "probability_threshold"
probability_goal = 0.9
top_n_goal = 8

use_zeroshot_previous_step = True
n = 4
top_n = n
zeroshot_method = "dotproduct"
batch_size = 32
num_text_pairs = 10
num_epochs = 25

## Parameters

In [30]:
split = "zeroshot" if use_zeroshot_previous_step else "fewshot"

zeroshot_config = {
    'dataset':which_dataset,
    'class_col':class_col,
    'data_col':data_col,
    'split':split,
    'method':zeroshot_method,
    'prob_goal':probability_goal,
    'top_n_goal':top_n_goal,
    'max_inferences':max_inferences,
    'classes':classes_list,#list(dict_classes_folha.values())
    'template': hyp_template,
    'random_state':422,
    'trainings_done':0,
    "batch_size": batch_size,
    "num_pairs": num_text_pairs,
    "num_epochs": num_epochs,
    'top_n': top_n,
    'n_examples': n
}


train_data = raw_data.sample(zeroshot_config['max_inferences'],random_state=zeroshot_config['random_state']).sort_index()

zeroberto_model = ZeroBERTo(classes_list=zeroshot_config['classes'],hypothesis_template=zeroshot_config['template'],
                  train_dataset=train_data,labeling_method=zeroshot_config['method'])


In [31]:
for training_step in range(num_training_steps):

    results = zeroberto.runZeroberto(zeroberto_model,train_data['text'],zeroshot_config)  

    df_results = zeroberto_model.evaluateLabeling(results,top_n=8)

    all_metrics = evaluation_metrics.get_metrics(df_results['prediction_code'].to_list(),df_results['class_code'].to_list())
    print("ZeroBERTo metrics:",all_metrics)

    gc.collect()

    save_time = evaluation_metrics.saveZeroshotResults(zeroshot_config,df_results)

    zeroshot_config['exec_time'] = save_time

    zeroshot_previous_data = datasets_handler.getZeroshotPreviousData(which_dataset,class_col,top_n=top_n,exec_time=zeroshot_config['exec_time'] )
    raw_data_final, zeroshot_config['new_class_col'] = datasets_handler.mergeLabelingToDataset(raw_data,zeroshot_previous_data,class_col)

    df_train, df_test = datasets_handler.splitDataset(raw_data_final,zeroshot_config)
    train_dataset,test_dataset = datasets_handler.buildDatasetDict(df_train,df_test)

    if (zeroshot_config['trainings_done'] <= 0):

        zeroshot_config['setfit_model'] = 'sentence-transformers/stsb-xlm-r-multilingual'
        setfit_model = SetFitModel.from_pretrained(zeroshot_config['setfit_model'],
                                            use_differentiable_head=True,
                                            head_params={"out_features":len(zeroshot_config['classes'])})
    else:
        setfit_model.model_body = new_embeddingModel

    # Create trainer
    trainer = SetFitTrainer(
        model=setfit_model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        loss_class=CosineSimilarityLoss,
        batch_size=zeroshot_config["batch_size"],
        num_iterations=zeroshot_config["num_pairs"], # Number of text pairs to generate for contrastive learning
        num_epochs=zeroshot_config["num_epochs"], # Number of epochs to use for contrastive learning
        column_mapping = {data_col: "text", 'class_code': "label"} # NÃO mudar
    )

    #### Train and evaluate SetFit Model

    # trainer.freeze() # Freeze the head
    # trainer.train() # Train only the body
  
    if (zeroshot_config['trainings_done'] <= 1):
        #### Unfreeze the head and unfreeze the body -> end-to-end training
        trainer.unfreeze(keep_body_frozen=False)
        print("Training Body and Head.")
    else:
        #### Unfreeze the head and freeze the body -> head-only training
        trainer.unfreeze(keep_body_frozen=True)
        zeroshot_config["num_epochs"] = 2
        print("Training only Head.")

    trainer.train(
        num_epochs=zeroshot_config["num_epochs"], # The number of epochs to train the head or the whole model (body and head)
        batch_size=zeroshot_config["batch_size"],
        body_learning_rate=1e-5, # The body's learning rate
        learning_rate=1e-2, # The head's learning rate
        l2_weight=0.01, # Weight decay on **both** the body and head. If `None`, will use 0.01.
    )

    zeroshot_config['trainings_done'] += 1
    gc.collect()

    y_pred = zeroberto.getPredictions(trainer)

    all_metrics = evaluation_metrics.get_metrics(y_pred ,test_dataset["class_code"])
    print("SetFit metrics:",all_metrics)

    new_embeddingModel = trainer.model.model_body

    zeroberto_model.embeddingModel = new_embeddingModel
    evaluation_metrics.saveResults(zeroshot_config,all_metrics)

    zeroshot_config['random_state'] += 1
    zeroberto_model.train_dataset = raw_data.sample(zeroshot_config['max_inferences'],random_state=zeroshot_config['random_state']).sort_index()



Preds: 100  - Total time: 5.23 seconds - ETA: 0.4 minutes
Preds: 200  - Total time: 9.84 seconds - ETA: 0.4 minutes
Preds: 300  - Total time: 14.44 seconds - ETA: 0.4 minutes
Preds: 400  - Total time: 19.19 seconds - ETA: 0.4 minutes
Preds: 500  - Total time: 23.71 seconds - ETA: 0.4 minutes
top 1: {'accuracy': 0.75}
top 2: {'accuracy': 0.75}
top 3: {'accuracy': 0.8333333333333334}
top 4: {'accuracy': 0.8125}
top 5: {'accuracy': 0.8}
top 6: {'accuracy': 0.75}
top 7: {'accuracy': 0.75}
top 8: {'accuracy': 0.71875}
top 500: {'accuracy': 0.472}
ZeroBERTo metrics: {'weighted': [{'accuracy': 0.472}, {'precision': 0.5325776249893898}, {'recall': 0.472}, {'f1': 0.44444732764083916}], 'macro': [{'accuracy': 0.472}, {'precision': 0.5309131228248876}, {'recall': 0.4696565378176432}, {'f1': 0.44160736242716464}]}
predictions_and_probabilities_test_2023_02_17__14_49_58.csv
zeroshot_config_test_2023_02_17__14_49_58.csv


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Training Body and Head.


Epoch:   0%|          | 0/25 [00:00<?, ?it/s]

Running predictions on 500 sentences.
SetFit metrics: {'weighted': [{'accuracy': 0.634}, {'precision': 0.6624396013351639}, {'recall': 0.634}, {'f1': 0.6401919413193805}], 'macro': [{'accuracy': 0.634}, {'precision': 0.6624396013351638}, {'recall': 0.634}, {'f1': 0.6401919413193804}]}
metrics_setfit_2023_02_17__14_52_57.csv
config_setfit_2023_02_17__14_52_57.csv
Preds: 100  - Total time: 4.87 seconds - ETA: 0.4 minutes
Preds: 200  - Total time: 9.52 seconds - ETA: 0.4 minutes
Preds: 300  - Total time: 14.18 seconds - ETA: 0.4 minutes
Preds: 400  - Total time: 18.85 seconds - ETA: 0.4 minutes
Preds: 500  - Total time: 23.54 seconds - ETA: 0.4 minutes
top 1: {'accuracy': 0.5}
top 2: {'accuracy': 0.5}
top 3: {'accuracy': 0.3333333333333333}
top 4: {'accuracy': 0.25}
top 5: {'accuracy': 0.25}
top 6: {'accuracy': 0.20833333333333334}
top 7: {'accuracy': 0.21428571428571427}
top 8: {'accuracy': 0.1875}
top 500: {'accuracy': 0.23}
ZeroBERTo metrics: {'weighted': [{'accuracy': 0.23}, {'precisi

Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Training Body and Head.


Epoch:   0%|          | 0/25 [00:00<?, ?it/s]

Running predictions on 500 sentences.
SetFit metrics: {'weighted': [{'accuracy': 0.396}, {'precision': 0.3881211580565028}, {'recall': 0.396}, {'f1': 0.3275591793757188}], 'macro': [{'accuracy': 0.396}, {'precision': 0.3881211580565029}, {'recall': 0.396}, {'f1': 0.3275591793757189}]}
metrics_setfit_2023_02_17__14_56_39.csv
config_setfit_2023_02_17__14_56_39.csv
Preds: 100  - Total time: 4.81 seconds - ETA: 0.4 minutes
Preds: 200  - Total time: 9.72 seconds - ETA: 0.4 minutes
Preds: 300  - Total time: 14.41 seconds - ETA: 0.4 minutes
Preds: 400  - Total time: 18.96 seconds - ETA: 0.4 minutes
Preds: 500  - Total time: 23.44 seconds - ETA: 0.4 minutes
top 1: {'accuracy': 0.5}
top 2: {'accuracy': 0.5}
top 3: {'accuracy': 0.5}
top 4: {'accuracy': 0.625}
top 5: {'accuracy': 0.5}
top 6: {'accuracy': 0.4583333333333333}
top 7: {'accuracy': 0.42857142857142855}
top 8: {'accuracy': 0.40625}
top 500: {'accuracy': 0.248}
ZeroBERTo metrics: {'weighted': [{'accuracy': 0.248}, {'precision': 0.291060

Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Training only Head.


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running predictions on 500 sentences.
SetFit metrics: {'weighted': [{'accuracy': 0.486}, {'precision': 0.45985095555096756}, {'recall': 0.486}, {'f1': 0.4375910111071982}], 'macro': [{'accuracy': 0.486}, {'precision': 0.45985095555096756}, {'recall': 0.486}, {'f1': 0.43759101110719817}]}
metrics_setfit_2023_02_17__14_57_58.csv
config_setfit_2023_02_17__14_57_58.csv
Preds: 100  - Total time: 4.68 seconds - ETA: 0.4 minutes
Preds: 200  - Total time: 9.17 seconds - ETA: 0.4 minutes
Preds: 300  - Total time: 13.67 seconds - ETA: 0.4 minutes
Preds: 400  - Total time: 18.2 seconds - ETA: 0.4 minutes
Preds: 500  - Total time: 22.61 seconds - ETA: 0.4 minutes
top 1: {'accuracy': 0.0}
top 2: {'accuracy': 0.0}
top 3: {'accuracy': 0.0}
top 4: {'accuracy': 0.0625}
top 5: {'accuracy': 0.1}
top 6: {'accuracy': 0.08333333333333333}
top 7: {'accuracy': 0.14285714285714285}
top 8: {'accuracy': 0.1875}
top 500: {'accuracy': 0.234}
ZeroBERTo metrics: {'weighted': [{'accuracy': 0.234}, {'precision': 0.217

Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Training only Head.


Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running predictions on 500 sentences.
SetFit metrics: {'weighted': [{'accuracy': 0.438}, {'precision': 0.4783474950920889}, {'recall': 0.438}, {'f1': 0.3854931891783714}], 'macro': [{'accuracy': 0.438}, {'precision': 0.4783474950920889}, {'recall': 0.43799999999999994}, {'f1': 0.38549318917837133}]}
metrics_setfit_2023_02_17__14_59_23.csv
config_setfit_2023_02_17__14_59_23.csv


: 