# Dependencies

In [1]:
%load_ext autoreload
%autoreload 2
import nltk
import ipywidgets
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
tqdm.pandas()
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)
import torch
import datetime
import os
import datasets_handler
import evaluation_metrics
import zeroberto
from zeroberto import ZeroBERTo
# import datasets
# from datasets import Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
import gc

# Dataset

In [2]:
# Hypothesis template later handed to ZeroBERTo ("{}" is presumably the slot
# for the class name -- confirm in zeroberto). Other templates tried:
#   "O tema principal deste texto é {}."
#   "this text is about {}."
#   "this article is about {}."
hyp_template = "{}."

# Dataset to run on. Other datasets tried: 'folhauol', 'bbc-news'.
which_dataset = 'ag_news'

# The handler returns the data together with the names of its text column
# (data_col) and its label column (class_col).
raw_data, data_col, class_col = datasets_handler.getDataset(which_dataset)

# Distinct labels found in the label column.
classes_list = [*raw_data[class_col].unique()]
print(classes_list)

Found cached dataset ag_news (/Users/alealcoforado/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['business', 'science and technology', 'sports', 'world']


# ZeroBERTo

In [3]:
# Number of iterations executed by the `for training_step in range(...)` loop below.
num_training_steps = 2

## Parameters

In [4]:
# Upper bound on the number of rows sampled from raw_data for labeling
# (a previous run used 5000).
max_inferences = 3000

# Labeling settings recorded in the config below.
# Another zeroshot_method tried earlier: "probability_threshold".
zeroshot_method = "dotproduct"
probability_goal = 0.9
top_n_goal = 8

use_zeroshot_previous_step = True
n = 8
top_n = 8

# SetFit training hyper-parameters (read back from the config by the trainer).
batch_size = 8
num_text_pairs = 10
num_epochs = 20

# Which labels the downstream split should rely on; driven by the toggle above.
split = "zeroshot" if use_zeroshot_previous_step else "fewshot"

# Single source of truth for the run. Keys are kept in their original order.
zeroshot_config = {
    'dataset': which_dataset,
    'class_col': class_col,
    'data_col': data_col,
    # Previously hard-coded to "zeroshot", which silently ignored
    # use_zeroshot_previous_step; now follows the computed `split`.
    'split': split,
    'method': zeroshot_method,
    'prob_goal': probability_goal,
    'top_n_goal': top_n_goal,
    'max_inferences': max_inferences,
    'classes': classes_list,
    'template': hyp_template,
    'random_state': 422,
    'trainings_done': 0,
    'batch_size': batch_size,
    'num_pairs': num_text_pairs,
    'num_epochs': num_epochs,
    'top_n': top_n,
    'n_examples': n,
}

# Sample at most max_inferences rows. Clamp to the dataset size so that a
# small dataset does not make DataFrame.sample raise ValueError.
train_data = raw_data.sample(
    min(zeroshot_config['max_inferences'], len(raw_data)),
    random_state=zeroshot_config['random_state'],
).sort_index()

zeroberto_model = ZeroBERTo(
    classes_list=zeroshot_config['classes'],
    hypothesis_template=zeroshot_config['template'],
    train_dataset=train_data,
    labeling_method=zeroshot_config['method'],
)


In [None]:

# Each round: label train_data with ZeroBERTo, persist and evaluate the labels,
# fine-tune a fresh SetFit model on them, then hand the fine-tuned body back
# to ZeroBERTo.
for training_step in range(num_training_steps):
    # --- ZeroBERTo labeling ---
    # Index by data_col (the text column reported by the dataset handler)
    # instead of a hard-coded 'text', consistent with column_mapping below.
    results = zeroberto.runZeroberto(zeroberto_model, train_data[data_col], zeroshot_config)
    zeroshot_config['random_state'] += 1

    df_results = zeroberto_model.evaluateLabeling(results)

    all_metrics = evaluation_metrics.get_metrics(
        df_results['prediction_code'].to_list(),
        df_results['class_code'].to_list(),
    )
    print(all_metrics)

    gc.collect()

    # NOTE(review): trainings_done is incremented twice per round (here and
    # after SetFit training below) -- confirm that this is intended.
    zeroshot_config['trainings_done'] += 1
    save_time = evaluation_metrics.saveZeroshotResults(zeroshot_config, df_results)

    # The value returned by the save is what the reload below uses to locate
    # this round's results.
    zeroshot_config['exec_time'] = save_time

    # --- Build the SetFit train/test data from the saved labeling ---
    # Read top_n from the config so the value used matches the one persisted.
    zeroshot_previous_data = datasets_handler.getZeroshotPreviousData(
        which_dataset,
        class_col,
        top_n=zeroshot_config['top_n'],
        exec_time=zeroshot_config['exec_time'],
    )
    raw_data_final, zeroshot_config['new_class_col'] = datasets_handler.mergeLabelingToDataset(
        raw_data, zeroshot_previous_data, class_col
    )

    df_train, df_test = datasets_handler.splitDataset(raw_data_final, zeroshot_config)
    train_dataset, test_dataset = datasets_handler.buildDatasetDict(df_train, df_test)

    # --- SetFit model and trainer ---
    zeroshot_config['setfit_model'] = 'sentence-transformers/stsb-xlm-r-multilingual'

    setfit_model = SetFitModel.from_pretrained(
        zeroshot_config['setfit_model'],
        use_differentiable_head=True,
        head_params={"out_features": len(zeroshot_config['classes'])},
    )

    trainer = SetFitTrainer(
        model=setfit_model,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        loss_class=CosineSimilarityLoss,
        batch_size=zeroshot_config["batch_size"],
        num_iterations=zeroshot_config["num_pairs"],  # Number of text pairs to generate for contrastive learning
        num_epochs=zeroshot_config["num_epochs"],  # Number of epochs to use for contrastive learning
        column_mapping={data_col: "text", 'class_code': "label"},  # do NOT change
    )

    # --- Train and evaluate the SetFit model ---
    # End-to-end training: both head and body are unfrozen.
    # (Alternative left by the author: trainer.freeze(); trainer.train() to
    # train only the body, then trainer.unfreeze(keep_body_frozen=True) for
    # head-only training.)
    trainer.unfreeze(keep_body_frozen=False)

    trainer.train(
        num_epochs=zeroshot_config["num_epochs"],  # Epochs for training the head or the whole model (body and head)
        batch_size=zeroshot_config["batch_size"],
        body_learning_rate=1e-5,  # The body's learning rate
        learning_rate=1e-2,  # The head's learning rate
        l2_weight=0.01,  # Weight decay on **both** the body and head. If `None`, will use 0.01.
    )

    zeroshot_config['trainings_done'] += 1
    gc.collect()

    y_pred = zeroberto.getPredictions(trainer)

    all_metrics = evaluation_metrics.get_metrics(y_pred, test_dataset["class_code"])
    print(all_metrics)

    # Replace ZeroBERTo's embedding model with the fine-tuned SetFit body
    # (presumably so the next round labels with it -- confirm in zeroberto).
    zeroberto_model.embeddingModel = trainer.model.model_body
    evaluation_metrics.saveResults(zeroshot_config, all_metrics)


## Evaluation

In [130]:
# Split raw_data_final again with the current config and rebuild the datasets.
# raw_data_final only exists after the training loop above has run.
df_train, df_test = datasets_handler.splitDataset(
    raw_data_final,
    zeroshot_config,
)
train_dataset, test_dataset = datasets_handler.buildDatasetDict(
    df_train,
    df_test,
)

In [134]:
# Point the existing trainer at the freshly built datasets.
# The trainer is constructed with `eval_dataset=` in the training loop, so the
# held-out split must be assigned to `eval_dataset`; the previous
# `trainer.test_dataset = ...` only created an attribute that the loop's
# trainer never had, leaving the old evaluation split in place.
trainer.train_dataset = train_dataset
trainer.eval_dataset = test_dataset

# End-to-end training: both head and body are unfrozen.
trainer.unfreeze(keep_body_frozen=False)

trainer.train(
    num_epochs=zeroshot_config["num_epochs"],  # Epochs for training the head or the whole model (body and head)
    batch_size=zeroshot_config["batch_size"],
    body_learning_rate=1e-5,  # The body's learning rate
    learning_rate=1e-2,  # The head's learning rate
    # Weight decay on **both** the body and head. If `None`, will use 0.01.
    # NOTE(review): this is 0.1 here versus 0.01 inside the training loop -- confirm it is deliberate.
    l2_weight=0.1,
)


Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

In [135]:
# Run a garbage-collection pass before inference.
gc.collect()

# Predictions are produced by the project helper, which receives the trainer.
y_pred = zeroberto.getPredictions(trainer)

Running predictions on 3000 sentences.


In [136]:
# Score the predictions against the "class_code" column of the test dataset
# and print the resulting metrics.
all_metrics = evaluation_metrics.get_metrics(
    y_pred,
    test_dataset["class_code"],
)
print(all_metrics)

{'weighted': [{'accuracy': 0.676}, {'precision': 0.6840411102703291}, {'recall': 0.676}, {'f1': 0.6718740811994882}], 'macro': [{'accuracy': 0.676}, {'precision': 0.6840411102703292}, {'recall': 0.6759999999999999}, {'f1': 0.6718740811994883}]}
