# Dependencies

In [65]:
%load_ext autoreload
%autoreload 2
import nltk
import ipywidgets
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
tqdm.pandas()
# from pandarallel import pandarallel
# pandarallel.initialize(progress_bar=True)
import torch
import datetime
import os
import datasets_handler
import evaluation_metrics
import zeroberto
from zeroberto import ZeroBERTo
# import datasets
# from datasets import Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer
import gc

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Dataset

In [3]:
# which_dataset = 'folhauol' 
# which_dataset = 'bbc-news'
which_dataset = 'ag_news'

hyp_template = "{}."
# hyp_template = "O tema principal deste texto é {}."
# hyp_template = "this text is about {}."
# hyp_template = "this article is about {}."

raw_data, data_col, class_col = datasets_handler.getDataset(which_dataset)
classes_list = list(raw_data[class_col].unique())
print(classes_list)

Found cached dataset ag_news (/Users/alealcoforado/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

['business', 'science and technology', 'sports', 'world']


# Parameters

In [27]:
# max_inferences = len(train) #estava em 5000
max_inferences = 3000

# zeroshot_method = "probability_threshold"
probability_goal = 0.9
top_n_goal = 8

zeroshot_method = "dotproduct"  
# zeroshot_method = "kmeans"  

zeroshot_config = {
    'dataset':which_dataset,
    'class_col':class_col,
    'data_col':data_col,
    'split':"zeroshot",
    'method':zeroshot_method,
    'prob_goal':probability_goal,
    'top_n_goal':top_n_goal,
    'max_inferences':max_inferences,
    'classes':classes_list,#list(dict_classes_folha.values())
    'template': hyp_template,
    'random_state':422,
    'trainings_done':0
}
zeroshot_config

{'dataset': 'ag_news',
 'class_col': 'class',
 'data_col': 'text',
 'split': 'zeroshot',
 'method': 'dotproduct',
 'prob_goal': 0.9,
 'top_n_goal': 8,
 'max_inferences': 3000,
 'classes': ['business', 'science and technology', 'sports', 'world'],
 'template': '{}.',
 'random_state': 422,
 'trainings_done': 0}

In [6]:
train_data = raw_data.sample(zeroshot_config['max_inferences'],random_state=zeroshot_config['random_state']).sort_index()
len(train_data)

3000

# Model

In [9]:
model = ZeroBERTo(classes_list=zeroshot_config['classes'],hypothesis_template=zeroshot_config['template'],
                  train_dataset=train_data,labeling_method=zeroshot_config['method'])

In [10]:
results = zeroberto.runZeroberto(model,train_data['text'],zeroshot_config)  ##X## COMENTAR AQUI 


Preds: 50  - Total time: 2.94 seconds - ETA: 2.9 minutes
Preds: 100  - Total time: 5.98 seconds - ETA: 3.0 minutes
Preds: 150  - Total time: 8.95 seconds - ETA: 3.0 minutes
Preds: 200  - Total time: 11.55 seconds - ETA: 2.9 minutes
Preds: 250  - Total time: 14.74 seconds - ETA: 2.9 minutes
Preds: 300  - Total time: 17.42 seconds - ETA: 2.9 minutes
Preds: 350  - Total time: 20.06 seconds - ETA: 2.9 minutes
Preds: 400  - Total time: 22.53 seconds - ETA: 2.8 minutes
Preds: 450  - Total time: 25.25 seconds - ETA: 2.8 minutes
Preds: 500  - Total time: 28.33 seconds - ETA: 2.8 minutes
Preds: 550  - Total time: 31.23 seconds - ETA: 2.8 minutes
Preds: 600  - Total time: 34.14 seconds - ETA: 2.8 minutes
Preds: 650  - Total time: 37.09 seconds - ETA: 2.9 minutes
Preds: 700  - Total time: 39.94 seconds - ETA: 2.9 minutes
Preds: 750  - Total time: 42.82 seconds - ETA: 2.9 minutes
Preds: 800  - Total time: 45.54 seconds - ETA: 2.8 minutes
Preds: 850  - Total time: 48.16 seconds - ETA: 2.8 minutes
P

# Evaluation

In [12]:
df_results = model.evaluateLabeling(results)

top 1: {'accuracy': 0.5}
top 2: {'accuracy': 0.75}
top 3: {'accuracy': 0.75}
top 4: {'accuracy': 0.6875}
top 5: {'accuracy': 0.75}
top 6: {'accuracy': 0.7916666666666666}
top 7: {'accuracy': 0.7857142857142857}
top 8: {'accuracy': 0.78125}
top 9: {'accuracy': 0.8055555555555556}
top 10: {'accuracy': 0.8}
top 11: {'accuracy': 0.7954545454545454}
top 12: {'accuracy': 0.7916666666666666}
top 13: {'accuracy': 0.7884615384615384}
top 14: {'accuracy': 0.7857142857142857}
top 15: {'accuracy': 0.7833333333333333}
top 16: {'accuracy': 0.78125}
top 3000: {'accuracy': 0.4786666666666667}


In [13]:
all_metrics = evaluation_metrics.get_metrics(df_results['prediction_code'].to_list(),df_results['class_code'].to_list())


## Save Results

In [17]:
zeroshot_config['trainings_done'] += 1
save_time = evaluation_metrics.saveZeroshotResults(zeroshot_config,df_results)

predictions_and_probabilities_test_2023_02_17__04_08_05.csv
zeroshot_config_test_2023_02_17__04_08_05.csv


# Fine-Tuning (SetFit)

In [33]:
use_zeroshot_previous_step = True
n = 16
top_n = 16
zeroshot_config['top_n'] = top_n
zeroshot_config['n_examples'] = n
zeroshot_config['exec_time'] = save_time

split = "zeroshot" if use_zeroshot_previous_step else "fewshot"


In [67]:
zeroshot_config

{'dataset': 'ag_news',
 'class_col': 'class',
 'data_col': 'text',
 'split': 'zeroshot',
 'method': 'dotproduct',
 'prob_goal': 0.9,
 'top_n_goal': 8,
 'max_inferences': 3000,
 'classes': ['business', 'science and technology', 'sports', 'world'],
 'template': '{}.',
 'random_state': 422,
 'trainings_done': 0,
 'top_n': 16,
 'n_examples': 16}

In [88]:
zeroshot_previous_data = datasets_handler.getZeroshotPreviousData(which_dataset,class_col,top_n=top_n,exec_time=exec_time)
raw_data_final, zeroshot_config['new_class_col'] = datasets_handler.mergeLabelingToDataset(raw_data,zeroshot_previous_data,class_col)

In [89]:
df_train, df_test = datasets_handler.splitDataset(raw_data_final,zeroshot_config)
train_dataset,test_dataset = datasets_handler.buildDatasetDict(df_train,df_test)

## Model

In [91]:
zeroshot_config['setfit_model'] = 'sentence-transformers/stsb-xlm-r-multilingual'

model = SetFitModel.from_pretrained(zeroshot_config['setfit_model'],
                                    use_differentiable_head=True,
                                    head_params={"out_features":len(zeroshot_config['classes'])})

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [92]:
batch_size = 8
num_text_pairs = 10
num_epochs = 20

zeroshot_config["batch_size"] = batch_size,
zeroshot_config["num_pairs"] = num_text_pairs,
zeroshot_config["num_epochs"] = num_epochs,
# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=zeroshot_config["batch_size"],
    num_iterations=zeroshot_config["num_pairs"], # Number of text pairs to generate for contrastive learning
    num_epochs=zeroshot_config["num_epochs"], # Number of epochs to use for contrastive learning
    column_mapping = {data_col: "text", class_col: "label"} # NÃO mudar
)
zeroshot_config

{'dataset': 'ag_news',
 'class_col': 'class',
 'data_col': 'text',
 'split': 'zeroshot',
 'method': 'dotproduct',
 'prob_goal': 0.9,
 'top_n_goal': 8,
 'max_inferences': 3000,
 'classes': ['business', 'science and technology', 'sports', 'world'],
 'template': '{}.',
 'random_state': 422,
 'trainings_done': 0,
 'top_n': 16,
 'n_examples': 16,
 'new_class_col': 'new_class',
 'setfit_model': 'sentence-transformers/stsb-xlm-r-multilingual',
 'batch_size': (8,),
 'num_pairs': (10,),
 'num_epochs': (20,)}

In [93]:
%%time
# Train and evaluate
# trainer.freeze() # Freeze the head
# trainer.train() # Train only the body

# Unfreeze the head and freeze the body -> head-only training
# trainer.unfreeze(keep_body_frozen=True)

# Unfreeze the head and unfreeze the body -> end-to-end training
trainer.unfreeze(keep_body_frozen=False)

trainer.train(
    num_epochs=zeroshot_config["num_epochs"], # The number of epochs to train the head or the whole model (body and head)
    batch_size=zeroshot_config["batch_size"],
    body_learning_rate=1e-5, # The body's learning rate
    learning_rate=1e-2, # The head's learning rate
    l2_weight=0.1, # Weight decay on **both** the body and head. If `None`, will use 0.01.
)

print(zeroshot_config)

ValueError: The following columns are missing from the dataset: {'new_class_code'}. Please provide a mapping for all required columns.