# === ZeroBERTo Fit ===

# Dependencies

In [None]:
# !pip install setfit
# !pip install datasets
%load_ext autoreload
%autoreload 2

In [None]:
# from google.colab import drive
# import pandas as pd

# drive.mount('/content/drive')

In [None]:
import pandas as pd
import datasets_handler
import datasets
from datasets import Dataset
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitModel, SetFitTrainer, sample_dataset
import evaluation_metrics
import gc
import zeroberto
tqdm.pandas()


# Data Prep

In [None]:
# --- Run configuration -------------------------------------------------------

# Dataset to run on; uncomment exactly one of the alternatives to switch.
which_dataset = 'folhauol'
# which_dataset = 'bbc-news'
# which_dataset = 'ag_news'

# Identifier of the zero-shot run whose output is loaded; it is also recorded
# in the results config under the "0shot_data" key.
exec_time = 'first_test'

# True  -> train on labels produced by the previous zero-shot step.
# False -> standard few-shot training on the dataset's own labels.
use_zeroshot_previous_step = True
if use_zeroshot_previous_step:
    split = "zeroshot"
else:
    split = "fewshot"

# Seed handed to the pandas sampling calls in the split cells.
random_state = 422

# Per-class number of training examples drawn in the few-shot split.
n = 4

# Fraction of the held-out rows (per class) kept for evaluation.
# Timings below were noted by the author (presumably total evaluation time):
## 1/64 ---> <6 min
## 1/32 ---> <12 min
## 1/16 ---> <22 min
test_dataset_sample_size = 1/320

In [None]:
# Passed as `top_n` when loading the previous zero-shot output, and used as the
# per-class cap on training rows sampled in the zero-shot split.
top_n = 4

## Import Data

In [None]:
# Load the selected dataset. The helper returns the data plus the names of the
# text column (`data_col`) and the label column (`class_col`); later cells use
# `raw_data` as a pandas DataFrame (`.copy()`, column indexing).
raw_data, data_col, class_col = datasets_handler.getDataset(which_dataset)


### Data from Zero-Shot previous step

In [None]:
# Build `raw_data_final` (the frame that gets split into train/test) and
# `new_class_col` (the label column the rest of the notebook works with).
if use_zeroshot_previous_step:
    # Zero-shot mode: load the labels produced by the previous zero-shot run and
    # merge them into the raw data. Later cells rely on the merged frame having
    # a 'prediction' column and a `new_class_col + "_code"` column.
    # NOTE(review): presumably rows with a prediction get it as their label and
    # the remaining rows keep their true label for testing -- confirm in
    # datasets_handler.mergeLabelingToDataset.
    zeroshot_previous_data = datasets_handler.getZeroshotPreviousData(
        which_dataset, class_col, top_n=top_n, exec_time=exec_time
    )
    raw_data_final, new_class_col = datasets_handler.mergeLabelingToDataset(
        raw_data, zeroshot_previous_data, class_col
    )
else:
    # Few-shot mode: use the dataset's own labels. Encode a copy so that
    # `raw_data` itself is never touched by the encoder (the previous version
    # created the copy but then passed the original frame to the encoder).
    new_class_col = class_col
    raw_data_final = evaluation_metrics.Encoder(raw_data.copy(), [new_class_col])


# Display the resulting columns as the cell output.
raw_data_final.columns



## Dataset Split

### Standard SetFit (Few-Shot)

In [None]:
### Split the dataframe into train and test, with n samples per class for few-shot

if split == "fewshot":
    # Training set: up to `n` rows per class. `random_state` makes the draw
    # reproducible, matching the zero-shot split cell.
    df_train = raw_data_final.groupby(new_class_col)[[data_col, new_class_col + "_code"]].apply(
        lambda s: s.sample(min(len(s), n), random_state=random_state)
    )

    # Anti-join on (text, label code): every row of the full frame whose pair
    # also appears in the training set is excluded from the test candidates.
    keys = list(df_train.columns.values)
    i1 = raw_data_final.set_index(keys).index
    i2 = df_train.set_index(keys).index
    df_test = raw_data_final[~i1.isin(i2)]

    # Test set: a `test_dataset_sample_size` fraction of the remaining rows of
    # each class (truncated to an integer count).
    df_test = df_test.groupby(new_class_col)[[data_col, new_class_col + "_code"]].apply(
        lambda x: x.sample(int(len(x) * test_dataset_sample_size), random_state=random_state)
    )

    # Force the text column to str before the frames are turned into Datasets.
    df_train[data_col] = df_train[data_col].astype(str)
    df_test[data_col] = df_test[data_col].astype(str)
    print(df_train.shape, df_test.shape)

### Zeroberto SetFit

In [None]:
# ZeroBERTo split: train on rows labelled by the previous zero-shot step,
# evaluate on a per-class sample of everything else.
if split == "zeroshot":
    code_col = new_class_col + "_code"

    # Training set: among rows that carry a zero-shot prediction, take up to
    # `top_n` rows per label code (seeded for reproducibility).
    has_prediction = raw_data_final['prediction'].notna()
    df_train = (
        raw_data_final[has_prediction]
        .groupby(code_col)[[data_col, code_col]]
        .apply(lambda grp: grp.sample(min(len(grp), top_n), random_state=random_state))
    )

    # Anti-join on (text, label code) to drop the training pairs from the pool.
    keys = df_train.columns.tolist()
    i1 = raw_data_final.set_index(keys).index
    i2 = df_train.set_index(keys).index
    held_out = raw_data_final[~i1.isin(i2)]

    # Test set: a `test_dataset_sample_size` fraction of each label code's
    # held-out rows (truncated to an integer count).
    df_test = held_out.groupby(code_col)[[data_col, code_col]].apply(
        lambda grp: grp.sample(int(len(grp) * test_dataset_sample_size), random_state=random_state)
    )

    # Force the text column to str before the frames are turned into Datasets.
    df_train[data_col] = df_train[data_col].astype(str)
    df_test[data_col] = df_test[data_col].astype(str)
    print(df_train.shape, df_test.shape)

### Build Dataset Dict

In [None]:
### Turn the train/test dataframes into a DatasetDict

# Each frame is handed to `Dataset.from_dict` column by column.
train_dataset = Dataset.from_dict(df_train)
test_dataset = Dataset.from_dict(df_test)

# `dataset` and `dataset_dict` are two names for the same DatasetDict object.
dataset_dict = datasets.DatasetDict(
    {
        "train": train_dataset,
        "test": test_dataset,
    }
)
dataset = dataset_dict
dataset

# SetFit

## Model

In [None]:
# model = saved_model


In [None]:
# Sentence-transformer checkpoint used as the SetFit body.
# model_name = "sentence-transformers/nli-roberta-base-v2"
model_name = "ricardo-filho/bert-base-portuguese-cased-nli-assin-2"


#### Models
# "sentence-transformers/paraphrase-mpnet-base-v2"
# "ricardo-filho/bert-base-portuguese-cased-nli-assin-2"
# "ricardo-filho/bert-portuguese-cased-nli-assin-2"
# "sentence-transformers/nli-roberta-base-v2"
# "neuralmind/bert-large-portuguese-cased"
# "joeddav/xlm-roberta-large-xnli"
# "openai-gpt"

# The differentiable head gets one output per distinct value of the original
# label column in the raw data.
model = SetFitModel.from_pretrained(
    model_name,
    use_differentiable_head=True,
    head_params={
        "out_features": len(raw_data[class_col].drop_duplicates()),
    },
)

## Parameters

In [None]:
# Hyperparameters handed to the SetFitTrainer constructor.
batch_size = 8
num_text_pairs = 15
num_epochs = 1

# Description of this run; it is printed after training and later passed to
# evaluation_metrics.saveResults together with the metrics.
setfit_config = {
    "model": model_name,
    "dataset": which_dataset,
    "batch_size": batch_size,
    "num_pairs": num_text_pairs,
    "num_epochs": num_epochs,
    "dataset_sample_size": test_dataset_sample_size,
    "N_examples": n,
    "split": split,
    "0shot_data": exec_time,
}

# Create trainer
trainer = SetFitTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    loss_class=CosineSimilarityLoss,
    batch_size=batch_size,
    # Number of text pairs to generate for contrastive learning
    num_iterations=num_text_pairs,
    # Number of epochs to use for contrastive learning
    num_epochs=num_epochs,
    # Do NOT change: renames the text and label-code columns to the
    # "text" / "label" names given to the trainer.
    column_mapping={data_col: "text", new_class_col + "_code": "label"},
)

In [None]:
%%time
# Train the model. Alternative workflows are kept below for reference.
#
# Body-only (contrastive) stage:
# trainer.freeze() # Freeze the head
# trainer.train() # Train only the body
#
# Head-only training (unfreeze the head, keep the body frozen):
# trainer.unfreeze(keep_body_frozen=True)

# Active choice: unfreeze both head and body -> end-to-end training.
trainer.unfreeze(keep_body_frozen=False)

# NOTE(review): these values are passed to train() directly and differ from the
# batch_size / num_epochs stored in `setfit_config`, so the printed and saved
# config does not record them -- confirm that this is intended.
train_kwargs = {
    "num_epochs": 25,  # epochs for training the head or the whole model (body and head)
    "batch_size": 2,
    "body_learning_rate": 1e-5,  # the body's learning rate
    "learning_rate": 1e-2,  # the head's learning rate
    "l2_weight": 0.0,  # weight decay on **both** the body and head; if `None`, will use 0.01
}
trainer.train(**train_kwargs)

print(setfit_config)

## Training

In [None]:
%%time
# Disabled alternative: head-only training pass.
# trainer.unfreeze(keep_body_frozen=True)

# trainer.train()

# Run a garbage-collection pass after training; as the last expression of the
# cell, its return value is shown as the cell output.
gc.collect()

In [None]:
# Quick smoke test: call the model directly on three short sample strings and
# show the result as the cell output.
model(["oi","sports","business and economics"])
# trainer.evaluate()

## Inference

In [60]:
%%time
# Run inference through the project helper. The active call requests
# probabilities; the label-only variant is kept commented out.
# NOTE(review): the output recorded for this cell ends in
# "TypeError: linear(): argument 'input' (position 1) must be Tensor, not
# numpy.ndarray", raised somewhere inside this call -- the fix belongs in the
# helper, which is not part of this notebook.
# y_pred = zeroberto.getPredictions(trainer)
y_pred = zeroberto.getProbabilities(trainer)

#### folhauol: approx. 4 predictions per second
#### bbcnews: approx. 4 predictions per second
#### ag_news: approx. 33 predictions per second

<class 'torch.device'>
Running predictions (with probabilities) on 282 sentences.


TypeError: linear(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

# Evaluation

## Metrics

In [None]:
# Show the metrics of the previous run (if any) for comparison before they are
# overwritten. On the first run `all_metrics` does not exist yet, which is the
# only failure that should be ignored here.
try:
    print(all_metrics)
except NameError:
    pass

# Compute metrics of the predictions against the encoded test labels.
# NOTE(review): `y_pred` is produced by zeroberto.getProbabilities in the
# inference cell -- confirm get_metrics accepts probabilities rather than labels.
all_metrics = evaluation_metrics.get_metrics(y_pred, test_dataset[new_class_col + "_code"])
print(all_metrics)

## Save Results

In [None]:
# Persist the run configuration together with its metrics. The returned value
# is kept as `setfit_exec_time` and later used as the model's folder name.
setfit_exec_time  = evaluation_metrics.saveResults(setfit_config,all_metrics)
print(setfit_config)

# Save Model

In [None]:
# Save the trained model into a folder named after this run's identifier.
model_path = f"/Users/alealcoforado/Documents/Projetos/Modelos/{setfit_exec_time}"
print(model_path)
# Use the public hub-mixin entry point instead of the private
# `_save_pretrained` hook it delegates to.
trainer.model.save_pretrained(save_directory=model_path)

## Load Model

In [None]:
# Reload a previously saved model from a local directory.
load_path = r'/Users/alealcoforado/Documents/Projetos/Modelos/setfit_top_n=4_n=8_15pairs_v1'

# Public loader, the same entry point used to load the base checkpoint in the
# model cell, instead of the private `_from_pretrained` hook.
saved_model = SetFitModel.from_pretrained(load_path)
# Point `model_name` at the local path so the run config records which model was used.
model_name = load_path

{'weighted': [{'accuracy': 0.7208791208791209},
  {'precision': 0.7966533533828771},
  {'recall': 0.7208791208791209},
  {'f1': 0.7383634133416869}],
 'macro': [{'accuracy': 0.7208791208791209},
  {'precision': 0.47294791260373964},
  {'recall': 0.6956977371368128},
  {'f1': 0.5148314501574386}]}

  {'weighted': [{'accuracy': 0.7088331515812432},
  {'precision': 0.8112290505989518},
  {'recall': 0.7088331515812432},
  {'f1': 0.7416526123050681}],
 'macro': [{'accuracy': 0.7088331515812432},
  {'precision': 0.5086466490124514},
  {'recall': 0.7260511792805407},
  {'f1': 0.5487853669870654}]}

1/4 dataset
  {'weighted': [{'accuracy': 0.712040293517433},
  {'precision': 0.7969596487586078},
  {'recall': 0.712040293517433},
  {'f1': 0.7365853189562236}],
 'macro': [{'accuracy': 0.712040293517433},
  {'precision': 0.4964842305418267},
  {'recall': 0.7028323441240695},
  {'f1': 0.5374745594204607}]}

full dataset
{'weighted': [{'accuracy': 0.7090563785338325},
  {'precision': 0.7967113724849181},
  {'recall': 0.7090563785338325},
  {'f1': 0.734316306685027}],
 'macro': [{'accuracy': 0.7090563785338325},
  {'precision': 0.4954781686924334},
  {'recall': 0.7020811904269998},
  {'f1': 0.5360406756304892}]}