<a href="https://colab.research.google.com/github/alturkim/nlp-notebooks/blob/main/Few_Shot_Arabic_Sentiment_Analysis_using_SetFit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install setfit
!pip install datasets evaluate transformers[sentencepiece]
!pip install sentence-transformers

In [None]:
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitModel, SetFitTrainer

from typing import Dict

In [None]:
# Fetch the "train" split of the ar_res_reviews dataset, expose the "polarity"
# column under the name "label", and drop the two id columns that are not used
# further down.
dataset = (
    load_dataset("ar_res_reviews", split="train")
    .rename_column("polarity", "label")
    .remove_columns(["restaurant_id", "user_id"])
)


Downloading builder script:   0%|          | 0.00/3.11k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

Downloading and preparing dataset ar_res_reviews/default to /root/.cache/huggingface/datasets/ar_res_reviews/default/0.0.0/f303714dc96c8056d45dca8950e5b7fe6ad59b88d0c095e07724e0484824031c...


Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8364 [00:00<?, ? examples/s]

Dataset ar_res_reviews downloaded and prepared to /root/.cache/huggingface/datasets/ar_res_reviews/default/0.0.0/f303714dc96c8056d45dca8950e5b7fe6ad59b88d0c095e07724e0484824031c. Subsequent calls will reuse this data.


In [None]:
def train_eval_test_split(dataset : Dataset, seed: int = 10) -> Dict[str, Dataset]:
    """Split ``dataset`` into label-stratified train / eval / test parts.

    40% of the rows are first held out from the training part; that held-out
    portion is then cut in half, giving a 60% / 20% / 20% partition. Both cuts
    are stratified on the ``"label"`` column and use the same ``seed``.

    Args:
        dataset: Dataset exposing ``train_test_split`` and a ``"label"`` column.
        seed: Random seed forwarded to both ``train_test_split`` calls. The
            default (10) reproduces the split this function always produced.

    Returns:
        A dict with the keys ``"train"``, ``"eval"`` and ``"test"``. The
        ``"train"`` and ``"test"`` entries are the same as before; ``"eval"``
        is the half of the held-out portion that used to be discarded.
    """
    # First cut: 60% train vs. 40% held out.
    train_vs_rest = dataset.train_test_split(test_size=0.4, stratify_by_column="label", seed=seed)
    # Second cut: the held-out 40% is halved into eval and test.
    eval_vs_test = train_vs_rest["test"].train_test_split(test_size=0.5, stratify_by_column="label", seed=seed)
    return {
        "train": train_vs_rest["train"],
        "eval": eval_vs_test["train"],
        "test": eval_vs_test["test"],
    }

In [None]:
# Reuse the same split as the previous notebooks so that results are compared
# on the same test set.
split_datasets = train_eval_test_split(dataset=dataset)

In [None]:
# Few-shot setting: only a small random subset of the training split is used.
# The subset holds 2 * examples_per_class rows in total. It is NOT balanced per
# class: seeds are tried in order until the subset contains exactly
# int(positive_fraction * 2 * examples_per_class) rows whose label is 1.

examples_per_class = 64
# NOTE(review): 0.7 presumably mirrors the share of label-1 reviews in the full
# dataset — confirm.
positive_fraction = 0.7
max_seeds = 1000

few_shot_size = examples_per_class * 2
target_positives = int(positive_fraction * 2 * examples_per_class)

# The held-out test split does not depend on the sampling seed, so take it once.
test_ds = split_datasets["test"]

for seed in range(max_seeds):
    split: dict = split_datasets["train"].train_test_split(train_size=few_shot_size, shuffle=True, seed=seed)
    train_ds = split["train"]
    if sum(train_ds["label"]) == target_positives:
        print("seed:", seed)
        break
else:
    # Without this, an exhausted search would silently continue with the last
    # (non-matching) subset and nothing would signal that the target was missed.
    raise RuntimeError(
        f"No seed in range({max_seeds}) produced {target_positives} label-1 "
        f"examples out of {few_shot_size}."
    )
print(train_ds["label"])
print(test_ds)

seed: 6
[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0]
Dataset({
    features: ['label', 'text'],
    num_rows: 1673
})


In [None]:
# Pretrained checkpoint on the Hub used to initialise the SetFit model.
checkpoint = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
model = SetFitModel.from_pretrained(checkpoint)

# Trainer settings gathered in one place before being handed to SetFitTrainer.
# The held-out test split is passed as the evaluation set.
trainer_config = dict(
    model=model,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    loss_class=CosineSimilarityLoss,
    batch_size=16,
    num_iterations=40,  # number of text pairs to generate for contrastive learning
    num_epochs=4,  # number of epochs to use for contrastive learning
)
trainer = SetFitTrainer(**trainer_config)

In [None]:

# Train and evaluate!
# Order matters: train() must finish before evaluate() is called.
# evaluate() scores the model on the eval_dataset the trainer was built with
# (the held-out test split) and returns a metrics dict, which is printed below.
trainer.train()
metrics = trainer.evaluate()
print(metrics)

***** Running training *****
  Num examples = 10240
  Num epochs = 4
  Total optimization steps = 2560
  Total train batch size = 16


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2560 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2560 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2560 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2560 [00:00<?, ?it/s]

***** Running evaluation *****


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

{'accuracy': 0.8123132098027496}
