# === ZeroBERTo Fit ===

# Dependencies

In [134]:
# !pip install setfit
# !pip install datasets
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [135]:
# from google.colab import drive
# import pandas as pd

# drive.mount('/content/drive')

In [136]:
import pandas as pd
import datasets_handler
import datasets
from datasets import Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset
import evaluation_metrics
import gc
import zeroberto
from zeroberto import ZeroBERTo
tqdm.pandas()


# Data Prep

In [137]:
# Run configuration for this experiment.

# Seed handed to the pandas sampling calls as `random_state`.
random_state = 422

# Dataset selector; other values used previously: 'folhauol', 'bbc-news'.
which_dataset = 'ag_news'

# When True, labels from a previous zero-shot run (identified by `exec_time`)
# are merged into the data and the "zeroshot" split is used.
use_zeroshot_previous_step = True
exec_time = '2023_02_17__04_03_23'
if use_zeroshot_previous_step:
    split = "zeroshot"
else:
    split = "fewshot"

# Per-class sample size for the few-shot split.
n = 16

# Fraction of each class kept when sampling the test set.
# Timings noted by the author for different fractions:
## 1/64 ---> <6 min
## 1/32 ---> <12 min
## 1/16 ---> <22 min
test_dataset_sample_size = 1 / 32

In [138]:
# Passed as `top_n` to datasets_handler.getZeroshotPreviousData and used as the
# per-class cap when sampling the zero-shot training set.
top_n = 16

## Import Data

In [139]:
# Load the selected dataset through the project helper. The three return values
# are used below as the data frame, the name of the text column and the name of
# the class column.
raw_data, data_col, class_col = datasets_handler.getDataset(which_dataset)


Found cached dataset ag_news (/Users/alealcoforado/.cache/huggingface/datasets/ag_news/default/0.0.0/bc2bcb40336ace1a0374767fc29bb0296cdaf8a6da7298436239c54d79180548)


  0%|          | 0/2 [00:00<?, ?it/s]

### Data from Zero-Shot previous step

In [140]:
# Build `raw_data_final` and choose the label column (`new_class_col`) used downstream.
if use_zeroshot_previous_step:
    # Merge the labels produced by a previous zero-shot run (identified by `exec_time`).
    zeroshot_previous_data = datasets_handler.getZeroshotPreviousData(which_dataset,class_col,top_n=top_n,exec_time=exec_time)
    raw_data_final, new_class_col = datasets_handler.mergeLabelingToDataset(raw_data,zeroshot_previous_data,class_col)
else:
    # Plain few-shot: the original class column is the label column.
    new_class_col = class_col
    # Encode a copy rather than `raw_data` itself, so the loaded data stays
    # untouched in case Encoder modifies its argument.
    raw_data_final = evaluation_metrics.Encoder(raw_data.copy(),[new_class_col])

raw_data_final.columns



Index(['index', 'text', 'label', 'class', 'prediction', 'prediction_code',
       'top_probability', 'new_class', 'new_class_code'],
      dtype='object')

## Dataset Split

### Standard SetFit (Few-Shot)

In [141]:
### Split the dataframe into train and test, with n samples per class for few-shot.

if (split=="fewshot"):
  # Seeded sampling (same `random_state` as the zero-shot split) so the split is
  # reproducible across runs.
  df_train = raw_data_final.groupby(new_class_col)[[data_col,new_class_col+"_code"]].apply(lambda s: s.sample(min(len(s),n),random_state=random_state))
  # Anti-join on the (text, label code) pair: rows not sampled for training are
  # the candidates for the test set.
  keys = list(df_train.columns.values)
  i1 = raw_data_final.set_index(keys).index
  i2 = df_train.set_index(keys).index
  df_test = raw_data_final[~i1.isin(i2)]
  # Keep only a fraction of each class for evaluation.
  df_test = df_test.groupby(new_class_col)[[data_col,new_class_col+"_code"]].apply(lambda x:x.sample(int(len(x)*test_dataset_sample_size),random_state=random_state))
  df_train[data_col] = df_train[data_col].astype(str)
  df_test[data_col] = df_test[data_col].astype(str)
  print(df_train.shape,df_test.shape)

### ZeroBERTo SetFit

In [142]:
if split == "zeroshot":
  code_col = new_class_col + "_code"
  wanted_cols = [data_col, code_col]

  # Training set: up to `top_n` rows per label code, drawn only from rows that
  # carry a zero-shot prediction.
  has_prediction = raw_data_final['prediction'].notna()
  df_train = (
      raw_data_final[has_prediction]
      .groupby(code_col)[wanted_cols]
      .apply(lambda grp: grp.sample(min(len(grp), top_n), random_state=random_state))
  )

  # Anti-join on the (text, label code) pair to exclude the training rows.
  keys = list(df_train.columns.values)
  i1 = raw_data_final.set_index(keys).index
  i2 = df_train.set_index(keys).index
  in_train = i1.isin(i2)
  df_test = raw_data_final[~in_train]

  # Test set: a fixed fraction of every label-code group among the remaining rows.
  df_test = (
      df_test
      .groupby(code_col)[wanted_cols]
      .apply(lambda grp: grp.sample(int(len(grp) * test_dataset_sample_size), random_state=random_state))
  )

  # Force the text column to str in both frames.
  for frame in (df_train, df_test):
    frame[data_col] = frame[data_col].astype(str)
  print(df_train.shape, df_test.shape)

(64, 2) (3984, 2)


### Build Dataset Dict

In [143]:
### Turn the train/test dataframes into a DatasetDict.

train_dataset, test_dataset = (Dataset.from_dict(frame) for frame in (df_train, df_test))
# `dataset` and `dataset_dict` are two names for the same object.
dataset = dataset_dict = datasets.DatasetDict({"train": train_dataset, "test": test_dataset})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'new_class_code'],
        num_rows: 64
    })
    test: Dataset({
        features: ['text', 'new_class_code'],
        num_rows: 3984
    })
})

# SetFit

## Model

In [144]:
# model = saved_model


In [145]:
# Checkpoint used for the SetFit model.
# model_name = "sentence-transformers/nli-roberta-base-v2"
# model_name = "ricardo-filho/bert-base-portuguese-cased-nli-assin-2"
model_name = 'sentence-transformers/stsb-xlm-r-multilingual'

#### Models
# "sentence-transformers/paraphrase-mpnet-base-v2"
# "ricardo-filho/bert-base-portuguese-cased-nli-assin-2"
# "ricardo-filho/bert-portuguese-cased-nli-assin-2"
# "sentence-transformers/nli-roberta-base-v2"
# "neuralmind/bert-large-portuguese-cased"
# "joeddav/xlm-roberta-large-xnli"
# "openai-gpt"

# Size the classification head with one output per distinct value of the class column.
head_params = {"out_features": len(raw_data[class_col].drop_duplicates())}
model = SetFitModel.from_pretrained(
    model_name,
    use_differentiable_head=True,
    head_params=head_params,
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


## Parameters

In [146]:
# Settings handed to the SetFitTrainer constructor.
batch_size, num_text_pairs, num_epochs = 8, 10, 1

# Description of this run; printed after training and passed to
# evaluation_metrics.saveResults together with the metrics.
# NOTE(review): "N_examples" records `n`, while the zero-shot split samples
# `top_n` rows per class — confirm which value should be stored.
setfit_config = {
    "model": model_name,
    "dataset": which_dataset,
    "batch_size": batch_size,
    "num_pairs": num_text_pairs,
    "num_epochs": num_epochs,
    "dataset_sample_size": test_dataset_sample_size,
    "N_examples": n,
    "split": split,
    "0shot_data": exec_time,
}

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=batch_size,
    num_iterations=num_text_pairs,  # Number of text pairs to generate for contrastive learning
    num_epochs=num_epochs,  # Number of epochs to use for contrastive learning
    # Maps the notebook's column names onto the "text"/"label" names. Do NOT change.
    column_mapping={data_col: "text", new_class_col + "_code": "label"},
)

In [147]:
%%time
# Train and evaluate
# trainer.freeze() # Freeze the head
# trainer.train() # Train only the body

# Unfreeze the head and freeze the body -> head-only training
# trainer.unfreeze(keep_body_frozen=True)

# Unfreeze the head and unfreeze the body -> end-to-end training
trainer.unfreeze(keep_body_frozen=False)

trainer.train(
    num_epochs=20, # The number of epochs to train the head or the whole model (body and head)
    batch_size=16,
    body_learning_rate=1e-5, # The body's learning rate
    learning_rate=1e-2, # The head's learning rate
    l2_weight=0.0, # Weight decay on **both** the body and head. If `None`, will use 0.01.
)

print(setfit_config)

Applying column mapping to training dataset
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 128.


Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

{'model': 'sentence-transformers/stsb-xlm-r-multilingual', 'dataset': 'ag_news', 'batch_size': 8, 'num_pairs': 10, 'num_epochs': 1, 'dataset_sample_size': 0.03125, 'N_examples': 16, 'split': 'zeroshot', '0shot_data': '2023_02_17__04_03_23'}
CPU times: user 8min 55s, sys: 12min 28s, total: 21min 24s
Wall time: 9min 28s


: 

## Training

In [84]:
%%time
# trainer.unfreeze(keep_body_frozen=True)

# trainer.train()
# Run a garbage-collection pass; the cell output is the value returned by gc.collect().
gc.collect()

CPU times: user 73.3 ms, sys: 101 ms, total: 175 ms
Wall time: 427 ms


4862

In [85]:
# model(["oi","sports","business and economics"])
# trainer.evaluate()

## Inference

In [86]:
%%time
y_pred = zeroberto.getPredictions(trainer)
# y_pred = zeroberto.getProbabilities(trainer)

#### folhauol: aprox. 4 predicoes por segundo
#### bbcnews: aprox. 4 preds por segundo
#### ag_news: aprox. 33 preds por segundo

Running predictions on 3984 sentences.
CPU times: user 5min 3s, sys: 35.6 s, total: 5min 38s
Wall time: 2min 8s


In [87]:
import numpy as np

In [88]:
# trainer.model.predict_proba(np.ndarray(["oi"]))

0                    business
78     science and technology
448                    sports
492                     world
Name: class, dtype: object

In [113]:
# Second evaluation sample for the ZeroBERTo run, kept in original row order.
# The size is capped at the frame length so smaller datasets do not raise, and
# the seed comes from the shared `random_state` instead of a repeated literal.
second_test_df = raw_data_final.sample(min(3000, len(raw_data_final)), random_state=random_state).sort_index()

(32, 3)

In [131]:
# Build a ZeroBERTo model from the distinct values of the 'class' column, using
# the trainer's model body as the embedding model and the sampled frame as
# `train_dataset`.
# NOTE(review): the column name 'class' is hard-coded here rather than taken
# from `class_col` — confirm they match for every dataset.
zeroberto_model = ZeroBERTo(raw_data_final['class'].drop_duplicates().to_list(),embeddingModel=trainer.model.model_body,
                            train_dataset=second_test_df)

In [127]:
# Run ZeroBERTo over the text of the sampled frame (progress lines are printed during the run).
zeroshot_results = zeroberto.runZeroberto(zeroberto_model,second_test_df['text'],None)  ##X## COMMENT HERE


Preds: 50  - Total time: 2.65 seconds - ETA: 2.7 minutes
Preds: 100  - Total time: 5.2 seconds - ETA: 2.6 minutes
Preds: 150  - Total time: 7.8 seconds - ETA: 2.6 minutes
Preds: 200  - Total time: 10.44 seconds - ETA: 2.6 minutes
Preds: 250  - Total time: 13.07 seconds - ETA: 2.6 minutes
Preds: 300  - Total time: 15.71 seconds - ETA: 2.6 minutes
Preds: 350  - Total time: 18.35 seconds - ETA: 2.6 minutes
Preds: 400  - Total time: 20.9 seconds - ETA: 2.6 minutes
Preds: 450  - Total time: 23.55 seconds - ETA: 2.6 minutes
Preds: 500  - Total time: 26.11 seconds - ETA: 2.6 minutes
Preds: 550  - Total time: 28.6 seconds - ETA: 2.6 minutes
Preds: 600  - Total time: 31.15 seconds - ETA: 2.6 minutes
Preds: 650  - Total time: 33.71 seconds - ETA: 2.6 minutes
Preds: 700  - Total time: 36.17 seconds - ETA: 2.6 minutes
Preds: 750  - Total time: 38.55 seconds - ETA: 2.6 minutes
Preds: 800  - Total time: 40.99 seconds - ETA: 2.6 minutes
Preds: 850  - Total time: 43.34 seconds - ETA: 2.5 minutes
Preds

In [132]:
# Evaluate the zero-shot results; the returned frame provides the
# 'prediction_code' and 'class_code' columns used for the metrics below.
df_results = zeroberto_model.evaluateLabeling(zeroshot_results)

top 1: {'accuracy': 0.75}
top 2: {'accuracy': 0.75}
top 3: {'accuracy': 0.6666666666666666}
top 4: {'accuracy': 0.75}
top 5: {'accuracy': 0.8}
top 6: {'accuracy': 0.8333333333333334}
top 7: {'accuracy': 0.8214285714285714}
top 8: {'accuracy': 0.8125}
top 9: {'accuracy': 0.8055555555555556}
top 10: {'accuracy': 0.825}
top 11: {'accuracy': 0.7954545454545454}
top 12: {'accuracy': 0.7916666666666666}
top 13: {'accuracy': 0.8076923076923077}
top 14: {'accuracy': 0.8214285714285714}
top 15: {'accuracy': 0.8166666666666667}
top 16: {'accuracy': 0.828125}
top 3000: {'accuracy': 0.658}


In [133]:
# Metrics for the ZeroBERTo run: predicted codes first, reference class codes second.
all_metrics = evaluation_metrics.get_metrics(df_results['prediction_code'].to_list(),df_results['class_code'].to_list())
print(all_metrics)

{'weighted': [{'accuracy': 0.658}, {'precision': 0.6625285465311167}, {'recall': 0.658}, {'f1': 0.6571961559436302}], 'macro': [{'accuracy': 0.658}, {'precision': 0.6635544905870477}, {'recall': 0.6593758310172948}, {'f1': 0.6584169120317461}]}


# Evaluation

## Metrics

In [89]:
# Print the metrics left over from an earlier cell/run (if any) before
# recomputing them, so both results appear together in the output.
try:
    print(all_metrics)
except NameError:
    # `all_metrics` has not been defined in this kernel yet; nothing to show.
    pass
# Metrics for the SetFit predictions against the test-set label codes.
all_metrics = evaluation_metrics.get_metrics(y_pred ,test_dataset[new_class_col+"_code"])
print(all_metrics)

{'weighted': [{'accuracy': 0.6603915662650602}, {'precision': 0.7123584970780007}, {'recall': 0.6603915662650602}, {'f1': 0.6399341962236867}], 'macro': [{'accuracy': 0.6603915662650602}, {'precision': 0.7123584970780007}, {'recall': 0.6603915662650602}, {'f1': 0.6399341962236867}]}
{'weighted': [{'accuracy': 0.6636546184738956}, {'precision': 0.7142491479964926}, {'recall': 0.6636546184738956}, {'f1': 0.6450966749572223}], 'macro': [{'accuracy': 0.6636546184738956}, {'precision': 0.7142491479964928}, {'recall': 0.6636546184738956}, {'f1': 0.6450966749572222}]}


## Save Results

In [81]:
# Persist the run config and metrics through the project helper; the returned
# value is used below to name the saved-model directory.
setfit_exec_time  = evaluation_metrics.saveResults(setfit_config,all_metrics)
print(setfit_config)

metrics_setfit_2023_02_17__03_01_11.csv
config_setfit_2023_02_17__03_01_11.csv
{'model': 'sentence-transformers/stsb-xlm-r-multilingual', 'dataset': 'ag_news', 'batch_size': 8, 'num_pairs': 20, 'num_epochs': 1, 'dataset_sample_size': 0.03125, 'N_examples': 8, 'split': 'zeroshot', '0shot_data': '2023_02_17__01_35_09'}


# Save Model

In [None]:
# Save the trained model in a directory named after the results' execution tag.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
model_path = f"/Users/alealcoforado/Documents/Projetos/Modelos/{setfit_exec_time}"
print(model_path)
# Use the public save API rather than the private `_save_pretrained` hook.
trainer.model.save_pretrained(model_path)

## Load Model

In [None]:
# Reload a previously saved model from a local directory.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
load_path = r'/Users/alealcoforado/Documents/Projetos/Modelos/setfit_top_n=4_n=8_15pairs_v1'

# Public loader (the same entry point used to load the hub checkpoint above)
# rather than the private `_from_pretrained` hook.
saved_model = SetFitModel.from_pretrained(load_path)
# Record the loaded path as the model name so the run config reflects it.
model_name = load_path

{'weighted': [{'accuracy': 0.7208791208791209},
  {'precision': 0.7966533533828771},
  {'recall': 0.7208791208791209},
  {'f1': 0.7383634133416869}],
 'macro': [{'accuracy': 0.7208791208791209},
  {'precision': 0.47294791260373964},
  {'recall': 0.6956977371368128},
  {'f1': 0.5148314501574386}]}

  {'weighted': [{'accuracy': 0.7088331515812432},
  {'precision': 0.8112290505989518},
  {'recall': 0.7088331515812432},
  {'f1': 0.7416526123050681}],
 'macro': [{'accuracy': 0.7088331515812432},
  {'precision': 0.5086466490124514},
  {'recall': 0.7260511792805407},
  {'f1': 0.5487853669870654}]}

1/4 dataset
  {'weighted': [{'accuracy': 0.712040293517433},
  {'precision': 0.7969596487586078},
  {'recall': 0.712040293517433},
  {'f1': 0.7365853189562236}],
 'macro': [{'accuracy': 0.712040293517433},
  {'precision': 0.4964842305418267},
  {'recall': 0.7028323441240695},
  {'f1': 0.5374745594204607}]}

full dataset
{'weighted': [{'accuracy': 0.7090563785338325},
  {'precision': 0.7967113724849181},
  {'recall': 0.7090563785338325},
  {'f1': 0.734316306685027}],
 'macro': [{'accuracy': 0.7090563785338325},
  {'precision': 0.4954781686924334},
  {'recall': 0.7020811904269998},
  {'f1': 0.5360406756304892}]}