# Dependencies

In [20]:
%load_ext autoreload
%autoreload 2
import nltk
import ipywidgets
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
tqdm.pandas()
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)
import torch
import datetime
import os
import datasets_handler
import evaluation_metrics
import zeroberto
from zeroberto import ZeroBERTo
# import datasets
# from datasets import Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
import gc

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dataset

In [21]:
# Dataset to run on. Other names used with this notebook: 'folhauol', 'bbc-news'.
which_dataset = 'ag_news'

# Hypothesis template that each class name is formatted into ("{}" = bare class name).
# Other templates that were tried:
#   "O tema principal deste texto é {}."
#   "this text is about {}."
#   "this article is about {}."
hyp_template = "{}"

# The handler hands back the data together with the names of its text and label columns.
raw_data, data_col, class_col = datasets_handler.getDataset(which_dataset)

# Distinct values of the label column, as a plain Python list.
classes_list = [*raw_data[class_col].unique()]
print(classes_list)

Found cached dataset ag_news (/Users/alealcoforado/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['business', 'science and technology', 'sports', 'world']


# ZeroBERTo

## Parameters

In [22]:
# How many documents are sampled for the zero-shot pass.
# Alternative kept by the author: max_inferences = len(train)  (author's note: "it was 5000").
max_inferences = 5000

# Selection targets; stored in the config below as 'prob_goal' and 'top_n_goal'.
probability_goal = 0.9
top_n_goal = 8

# Labeling method handed to ZeroBERTo. Other values that appear in this notebook:
# "probability_threshold", "kmeans".
zeroshot_method = "dotproduct"

# Single dictionary describing the whole experiment; later cells add keys to it
# and it is passed to the save helpers.
zeroshot_config = dict(
    dataset=which_dataset,
    class_col=class_col,
    data_col=data_col,
    split="zeroshot",
    method=zeroshot_method,
    prob_goal=probability_goal,
    top_n_goal=top_n_goal,
    max_inferences=max_inferences,
    classes=classes_list,  # alternative kept by the author: list(dict_classes_folha.values())
    template=hyp_template,
    random_state=422,
    trainings_done=0,
)
zeroshot_config

{'dataset': 'ag_news',
 'class_col': 'class',
 'data_col': 'text',
 'split': 'zeroshot',
 'method': 'dotproduct',
 'prob_goal': 0.9,
 'top_n_goal': 8,
 'max_inferences': 5000,
 'classes': ['business', 'science and technology', 'sports', 'world'],
 'template': '{}',
 'random_state': 422,
 'trainings_done': 0}

In [23]:
# Reproducible subsample of the raw data (size and seed come from the config),
# put back into index order after sampling.
train_data = (
    raw_data
    .sample(
        n=zeroshot_config['max_inferences'],
        random_state=zeroshot_config['random_state'],
    )
    .sort_index()
)
len(train_data)

5000

## Model

In [24]:
# Build the zero-shot labeler from the experiment config and the sampled data.
model = ZeroBERTo(
    classes_list=zeroshot_config['classes'],
    hypothesis_template=zeroshot_config['template'],
    train_dataset=train_data,
    labeling_method=zeroshot_config['method'],
)

In [25]:
# Display the embedding model held by the ZeroBERTo instance (shown as the cell output).
model.embeddingModel

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [26]:
# model.embeddingModel = new_embeddingModel

## Inference

In [27]:
# Run the zero-shot labeling over the sampled texts (passed as a plain list).
# NOTE(review): the author's marker here read "COMENTAR AQUI" -- presumably a reminder to
# comment this line out to skip the slow inference pass; confirm before relying on it.
results = zeroberto.runZeroberto(model,train_data[data_col].to_list(),zeroshot_config)

Preds: 100  - Total time: 5.1 seconds - ETA: 4.3 minutes
Preds: 200  - Total time: 10.23 seconds - ETA: 4.3 minutes
Preds: 300  - Total time: 15.17 seconds - ETA: 4.2 minutes
Preds: 400  - Total time: 20.46 seconds - ETA: 4.3 minutes
Preds: 500  - Total time: 25.35 seconds - ETA: 4.2 minutes
Preds: 600  - Total time: 30.42 seconds - ETA: 4.2 minutes
Preds: 700  - Total time: 35.48 seconds - ETA: 4.2 minutes
Preds: 800  - Total time: 41.05 seconds - ETA: 4.3 minutes
Preds: 900  - Total time: 46.35 seconds - ETA: 4.3 minutes
Preds: 1000  - Total time: 51.66 seconds - ETA: 4.3 minutes
Preds: 1100  - Total time: 56.8 seconds - ETA: 4.3 minutes
Preds: 1200  - Total time: 61.54 seconds - ETA: 4.3 minutes
Preds: 1300  - Total time: 66.28 seconds - ETA: 4.2 minutes
Preds: 1400  - Total time: 70.99 seconds - ETA: 4.2 minutes
Preds: 1500  - Total time: 75.6 seconds - ETA: 4.2 minutes
Preds: 1600  - Total time: 80.36 seconds - ETA: 4.2 minutes
Preds: 1700  - Total time: 84.9 seconds - ETA: 4.2 mi

## Evaluation

In [28]:
# Evaluate the zero-shot labels; the returned frame is read below through its
# 'prediction_code' and 'class_code' columns.
df_results = model.evaluateLabeling(results)

top 1: {'accuracy': 0.75}
top 2: {'accuracy': 0.625}
top 3: {'accuracy': 0.6666666666666666}
top 4: {'accuracy': 0.6875}
top 5: {'accuracy': 0.75}
top 6: {'accuracy': 0.75}
top 7: {'accuracy': 0.75}
top 8: {'accuracy': 0.75}
top 9: {'accuracy': 0.7777777777777778}
top 10: {'accuracy': 0.775}
top 11: {'accuracy': 0.7727272727272727}
top 12: {'accuracy': 0.7916666666666666}
top 13: {'accuracy': 0.8076923076923077}
top 14: {'accuracy': 0.8214285714285714}
top 15: {'accuracy': 0.8166666666666667}
top 16: {'accuracy': 0.8125}
top 5000: {'accuracy': 0.4622}


In [29]:
# Print the metrics left over from a previous execution of this cell (if any), so they
# can be compared with the fresh ones printed below.
try:
    print(zeroshot_metrics)
except NameError:
    # First execution on this kernel: there is no earlier result to show.
    # Only NameError is ignored so that real failures and interrupts still surface.
    pass

# Compute and show the metrics for the current zero-shot labeling.
zeroshot_metrics = evaluation_metrics.get_metrics(df_results['prediction_code'].to_list(),df_results['class_code'].to_list())
print(zeroshot_metrics)

{'weighted': [{'accuracy': 0.2566}, {'precision': 0.34315285977983395}, {'recall': 0.2566}, {'f1': 0.1811666095543678}], 'macro': [{'accuracy': 0.2566}, {'precision': 0.23457328628089882}, {'recall': 0.2842195836131744}, {'f1': 0.19852673184153743}]}
{'weighted': [{'accuracy': 0.4622}, {'precision': 0.5792903125778271}, {'recall': 0.4622}, {'f1': 0.43187721978652643}], 'macro': [{'accuracy': 0.4622}, {'precision': 0.5773179133000164}, {'recall': 0.4658197788181281}, {'f1': 0.4330675116801649}]}


## Save Results

In [30]:
# Persist the config and the labeling results; the returned value is stored later as
# zeroshot_config['exec_time'] and used to look these results up again.
save_time = evaluation_metrics.saveZeroshotResults(zeroshot_config,df_results)

predictions_and_probabilities_test_2023_03_09__13_53_41.csv
zeroshot_config_test_2023_03_09__13_53_41.csv


# ST Training

In [31]:
# Switches and sizes for the training stage.
use_zeroshot_previous_step = True
n = 8
top_n = 8

# Record the training-stage settings (and the id of the saved zero-shot run) in the config.
zeroshot_config.update(
    top_n=top_n,
    n_examples=n,
    exec_time=save_time,
    st_train_epochs=10,
    st_train_batch_size=20,
)

# Name of the split, depending on whether the zero-shot labels are reused.
if use_zeroshot_previous_step:
    split = "zeroshot"
else:
    split = "fewshot"


In [32]:
# Reload the labels saved by the zero-shot run identified by 'exec_time'.
zeroshot_previous_data = datasets_handler.getZeroshotPreviousData(
    which_dataset,
    class_col,
    top_n=top_n,
    exec_time=zeroshot_config['exec_time'],
)

# Merge those labels into the raw data; the helper also returns the name of the
# column it produced, which is kept in the config.
(
    raw_data_final,
    zeroshot_config['new_class_col'],
) = datasets_handler.mergeLabelingToDataset(
    raw_data,
    zeroshot_previous_data,
    class_col,
)

In [33]:
# Split the merged data into train/test frames according to the config.
df_train, df_test = datasets_handler.splitDataset(raw_data_final,zeroshot_config)
# Wrap both frames into the dataset objects later passed to SetFitTrainer.
train_dataset,test_dataset = datasets_handler.buildDatasetDict(df_train,df_test)
# Documents derived from the training texts; only used by the (disabled) model.fit cell below.
train_documents = datasets_handler.splitDocuments(df_train[data_col])
len(train_documents)

54

## Model Body Training -- do not uncomment

In [34]:
# model.fit(train_documents,batch_size=zeroshot_config['st_train_batch_size'],epochs=zeroshot_config['st_train_epochs'])

# Fine-Tuning (SetFit)

## Model

In [35]:
# Checkpoint used to initialise the SetFit model; recorded in the config for the saved results.
zeroshot_config['setfit_model'] = 'sentence-transformers/stsb-xlm-r-multilingual'

# Differentiable classification head with one output per class.
setfit_model = SetFitModel.from_pretrained(
    zeroshot_config['setfit_model'],
    use_differentiable_head=True,
    head_params=dict(out_features=len(zeroshot_config['classes'])),
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [36]:
# Use the ZeroBERTo embedding model as the SetFit body (same object, not a copy).
setfit_model.model_body = model.embeddingModel

In [37]:
# Fine-tuning hyperparameters.
batch_size = 8
num_text_pairs = 15
num_epochs = 30

# Keep them in the experiment config so they are saved together with the results.
zeroshot_config.update(
    batch_size=batch_size,
    num_pairs=num_text_pairs,
    num_epochs=num_epochs,
)

# Trainer for the SetFit model built above.
trainer = SetFitTrainer(
    model=setfit_model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=zeroshot_config["batch_size"],
    # Number of text pairs to generate for contrastive learning.
    num_iterations=zeroshot_config["num_pairs"],
    # Number of epochs to use for contrastive learning.
    num_epochs=zeroshot_config["num_epochs"],
    # Author's note: do NOT change this mapping.
    column_mapping={data_col: "text", 'class_code': "label"},
)
zeroshot_config

{'dataset': 'ag_news',
 'class_col': 'class',
 'data_col': 'text',
 'split': 'zeroshot',
 'method': 'dotproduct',
 'prob_goal': 0.9,
 'top_n_goal': 8,
 'max_inferences': 5000,
 'classes': ['business', 'science and technology', 'sports', 'world'],
 'template': '{}',
 'random_state': 422,
 'trainings_done': 0,
 'top_n': 8,
 'n_examples': 8,
 'exec_time': '2023_03_09__13_53_41',
 'st_train_epochs': 10,
 'st_train_batch_size': 20,
 'new_class_col': 'new_class',
 'setfit_model': 'sentence-transformers/stsb-xlm-r-multilingual',
 'batch_size': 8,
 'num_pairs': 15,
 'num_epochs': 30}

In [38]:
%%time
# Train and evaluate
# trainer.freeze() # Freeze the head
# trainer.train() # Train only the body

# Unfreeze the head and freeze the body -> head-only training
# trainer.unfreeze(keep_body_frozen=True)

# Unfreeze the head and unfreeze the body -> end-to-end training
trainer.unfreeze(keep_body_frozen=False)

trainer.train(
    num_epochs=zeroshot_config["num_epochs"], # The number of epochs to train the head or the whole model (body and head)
    batch_size=zeroshot_config["batch_size"],
    body_learning_rate=1e-5, # The body's learning rate
    learning_rate=1e-2, # The head's learning rate
    l2_weight=0.1, # Weight decay on **both** the body and head. If `None`, will use 0.01.
)

zeroshot_config['trainings_done'] += 1

print(zeroshot_config)

Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Epoch:   0%|          | 0/30 [00:00<?, ?it/s]

{'dataset': 'ag_news', 'class_col': 'class', 'data_col': 'text', 'split': 'zeroshot', 'method': 'dotproduct', 'prob_goal': 0.9, 'top_n_goal': 8, 'max_inferences': 5000, 'classes': ['business', 'science and technology', 'sports', 'world'], 'template': '{}', 'random_state': 422, 'trainings_done': 1, 'top_n': 8, 'n_examples': 8, 'exec_time': '2023_03_09__13_53_41', 'st_train_epochs': 10, 'st_train_batch_size': 20, 'new_class_col': 'new_class', 'setfit_model': 'sentence-transformers/stsb-xlm-r-multilingual', 'batch_size': 8, 'num_pairs': 15, 'num_epochs': 30}
CPU times: user 8min 11s, sys: 14min 32s, total: 22min 43s
Wall time: 8min 22s


In [39]:
# Run a garbage-collection pass before predicting.
gc.collect()

# Predictions from the trained trainer; compared against test_dataset["class_code"] below.
y_pred = zeroberto.getPredictions(trainer)

Running predictions on 5000 sentences.


## Evaluation

In [40]:
# Print the metrics left over from a previous execution of this cell (if any), so they
# can be compared with the fresh ones printed below.
try:
    print(setfit_all_metrics)
except NameError:
    # First execution on this kernel: there is no earlier result to show.
    # Only NameError is ignored so that real failures and interrupts still surface.
    pass

# Compute and show the metrics of the fine-tuned model on the test split.
setfit_all_metrics = evaluation_metrics.get_metrics(y_pred ,test_dataset["class_code"])
print(setfit_all_metrics)

{'weighted': [{'accuracy': 0.6106}, {'precision': 0.6488292796466635}, {'recall': 0.6106}, {'f1': 0.5883676759060359}], 'macro': [{'accuracy': 0.6106}, {'precision': 0.6488292796466635}, {'recall': 0.6106}, {'f1': 0.5883676759060359}]}


In [41]:
# new_embeddingModel = trainer.model.model_body

In [42]:
# Save the config and the SetFit metrics; the returned value names the model folder below.
setfit_exec_time  = evaluation_metrics.saveResults(zeroshot_config,setfit_all_metrics)
print(zeroshot_config)

metrics_setfit_2023_03_09__14_05_21.csv
config_setfit_2023_03_09__14_05_21.csv
{'dataset': 'ag_news', 'class_col': 'class', 'data_col': 'text', 'split': 'zeroshot', 'method': 'dotproduct', 'prob_goal': 0.9, 'top_n_goal': 8, 'max_inferences': 5000, 'classes': ['business', 'science and technology', 'sports', 'world'], 'template': '{}', 'random_state': 422, 'trainings_done': 1, 'top_n': 8, 'n_examples': 8, 'exec_time': '2023_03_09__13_53_41', 'st_train_epochs': 10, 'st_train_batch_size': 20, 'new_class_col': 'new_class', 'setfit_model': 'sentence-transformers/stsb-xlm-r-multilingual', 'batch_size': 8, 'num_pairs': 15, 'num_epochs': 30}


In [43]:
# Root folder for saved models. It can be overridden with the ZEROBERTO_MODELS_DIR
# environment variable so the notebook also runs on other machines; the fallback is the
# location this notebook has used so far.
models_root = os.environ.get(
    "ZEROBERTO_MODELS_DIR",
    "/Users/alealcoforado/Documents/Projetos/Modelos",
)

# One sub-folder per run, named after the id returned when the results were saved.
model_path = os.path.join(models_root, str(setfit_exec_time))
print(model_path)

# NOTE(review): `_save_pretrained` is a private method; consider the public
# `save_pretrained` once confirmed equivalent for the installed SetFit version.
trainer.model._save_pretrained(save_directory=model_path)

/Users/alealcoforado/Documents/Projetos/Modelos/2023_03_09__14_05_21


# ZeroBERTo 2nd run

In [None]:
# Take the fine-tuned body from the trainer. It is defined here because the only other
# assignment of `new_embeddingModel` in this notebook is commented out, which would make
# this cell fail with NameError on a fresh kernel.
new_embeddingModel = trainer.model.model_body

# Rebuild the zero-shot labeler with the same settings, now on top of the fine-tuned embeddings.
model = ZeroBERTo(classes_list=zeroshot_config['classes'],hypothesis_template=zeroshot_config['template'],
                  train_dataset=train_data,labeling_method=zeroshot_config['method'],embeddingModel=new_embeddingModel)

In [None]:
# Second zero-shot pass. The input is built exactly as in the first pass: the text column
# comes from `data_col` (not a hard-coded 'text', which only matches some datasets) and is
# passed as a plain list rather than a Series.
# NOTE(review): the author's marker here read "COMENTAR AQUI" -- presumably a reminder to
# comment this line out to skip the slow inference pass; confirm before relying on it.
results = zeroberto.runZeroberto(model,train_data[data_col].to_list(),zeroshot_config)


Preds: 100  - Total time: 9.34 seconds - ETA: 3.1 minutes
Preds: 200  - Total time: 18.15 seconds - ETA: 3.0 minutes
Preds: 300  - Total time: 27.15 seconds - ETA: 3.0 minutes
Preds: 400  - Total time: 35.55 seconds - ETA: 3.0 minutes
Preds: 500  - Total time: 43.84 seconds - ETA: 2.9 minutes
Preds: 600  - Total time: 52.08 seconds - ETA: 2.9 minutes
Preds: 700  - Total time: 60.36 seconds - ETA: 2.9 minutes
Preds: 800  - Total time: 66.73 seconds - ETA: 2.8 minutes
Preds: 900  - Total time: 72.23 seconds - ETA: 2.7 minutes
Preds: 1000  - Total time: 77.5 seconds - ETA: 2.6 minutes
Preds: 1100  - Total time: 82.88 seconds - ETA: 2.5 minutes
Preds: 1200  - Total time: 94.68 seconds - ETA: 2.6 minutes
Preds: 1300  - Total time: 114.98 seconds - ETA: 2.9 minutes
Preds: 1400  - Total time: 129.69 seconds - ETA: 3.1 minutes
Preds: 1500  - Total time: 142.77 seconds - ETA: 3.2 minutes
Preds: 1600  - Total time: 150.28 seconds - ETA: 3.1 minutes
Preds: 1700  - Total time: 155.36 seconds - ETA

In [None]:
# Evaluate the second labeling pass.
df_results = model.evaluateLabeling(results)

# Metrics of the predicted codes against the class codes.
all_metrics = evaluation_metrics.get_metrics(
    df_results['prediction_code'].to_list(),
    df_results['class_code'].to_list(),
)

# Bump the round counter, then persist the config and the results of this pass.
zeroshot_config['trainings_done'] = zeroshot_config['trainings_done'] + 1
save_time = evaluation_metrics.saveZeroshotResults(zeroshot_config, df_results)


top 1: {'accuracy': 0.5}
top 2: {'accuracy': 0.75}
top 3: {'accuracy': 0.8333333333333334}
top 4: {'accuracy': 0.75}
top 5: {'accuracy': 0.75}
top 6: {'accuracy': 0.7083333333333334}
top 7: {'accuracy': 0.75}
top 8: {'accuracy': 0.78125}
top 9: {'accuracy': 0.7777777777777778}
top 10: {'accuracy': 0.8}
top 11: {'accuracy': 0.7954545454545454}
top 12: {'accuracy': 0.8125}
top 13: {'accuracy': 0.8269230769230769}
top 14: {'accuracy': 0.8392857142857143}
top 15: {'accuracy': 0.8333333333333334}
top 16: {'accuracy': 0.828125}
top 2000: {'accuracy': 0.5915}
predictions_and_probabilities_test_2023_02_17__07_30_06.csv
zeroshot_config_test_2023_02_17__07_30_06.csv


In [None]:
# Point the config at the results that were just saved.
zeroshot_config.update(exec_time=save_time)

# Reload those labels.
zeroshot_previous_data = datasets_handler.getZeroshotPreviousData(
    which_dataset,
    class_col,
    top_n=top_n,
    exec_time=zeroshot_config['exec_time'],
)

# Merge them into the raw data; the helper also returns the name of the column it
# produced, which is kept in the config.
(
    raw_data_final,
    zeroshot_config['new_class_col'],
) = datasets_handler.mergeLabelingToDataset(
    raw_data,
    zeroshot_previous_data,
    class_col,
)

In [None]:
# Re-split the freshly merged data according to the config.
df_train, df_test = datasets_handler.splitDataset(raw_data_final,zeroshot_config)
# Rebuild the dataset objects that are assigned to the trainer in the next cell.
train_dataset,test_dataset = datasets_handler.buildDatasetDict(df_train,df_test)

In [None]:
# Hand the refreshed splits to the existing trainer.
trainer.train_dataset = train_dataset
# The trainer was created with `eval_dataset=...`, so that is the attribute to refresh
# for the new test split to be picked up.
trainer.eval_dataset = test_dataset
# Kept from the original cell in case a project helper reads this attribute.
# NOTE(review): confirm whether anything uses `trainer.test_dataset`; drop it if not.
trainer.test_dataset = test_dataset

# End-to-end training: both head and body are unfrozen.
trainer.unfreeze(keep_body_frozen=False)

trainer.train(
    num_epochs=zeroshot_config["num_epochs"], # The number of epochs to train the head or the whole model (body and head)
    batch_size=zeroshot_config["batch_size"],
    body_learning_rate=1e-5, # The body's learning rate
    learning_rate=1e-2, # The head's learning rate
    l2_weight=0.1, # Weight decay on **both** the body and head. If `None`, will use 0.01.
)


Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
# Run a garbage-collection pass before predicting.
gc.collect()

# Predictions from the retrained trainer; compared against test_dataset["class_code"] below.
y_pred = zeroberto.getPredictions(trainer)

Running predictions on 2000 sentences.


In [None]:
# Metrics of the second-round predictions against the class codes of the test split.
all_metrics = evaluation_metrics.get_metrics(y_pred ,test_dataset["class_code"])
print(all_metrics)

{'weighted': [{'accuracy': 0.6045}, {'precision': 0.6950742046267486}, {'recall': 0.6045}, {'f1': 0.5852444221212971}], 'macro': [{'accuracy': 0.6045}, {'precision': 0.6950742046267486}, {'recall': 0.6045}, {'f1': 0.5852444221212971}]}
