# Classification with Generative Models
## Encoder Decoder Models

In [1]:
from datasets import load_dataset
data = load_dataset("rotten_tomatoes")
data

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [2]:
from transformers import pipeline

pipe = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device="cuda:0"
)


W0125 08:13:26.557000 37756 Lib\site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


In [None]:
prompt = "Is the following sentence positive or negative?"
data = data.map(lambda example: {"t5": prompt + example['text']})
data



Map: 100%|██████████| 8530/8530 [00:01<00:00, 7685.26 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 10901.39 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 19441.55 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [6]:
data["train"][0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1,
 't5': 'Is the following sentence positive or negative? the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}

In [12]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
y_pred= []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    text = output[0]["generated_text"]
    y_pred.append(0 if text == "negative" else 1)
    

100%|██████████| 1066/1066 [00:44<00:00, 24.20it/s]


In [13]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true,y_pred):
    performance = classification_report(y_true,y_pred,target_names=["Negative Review", "Positive Review"])
    print(performance)

In [14]:
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.83      0.85      0.84       533
Positive Review       0.85      0.83      0.84       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066



## Change the prompt

In [20]:
prompt = "Predict whether the provided text is a positive or negative movie review"
data = data.map(lambda example: {"t5": prompt + example['text']})
data

Map: 100%|██████████| 8530/8530 [00:00<00:00, 20535.60 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 17767.32 examples/s]
Map: 100%|██████████| 1066/1066 [00:00<00:00, 17272.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 't5'],
        num_rows: 1066
    })
})

In [21]:
data["train"][0]

{'text': 'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
 'label': 1,
 't5': 'Predict whether the provided text is a positive or negative movie reviewthe rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'}

In [22]:
from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
y_pred= []
for output in tqdm(pipe(KeyDataset(data["test"], "t5")), total=len(data["test"])):
    text = output[0]["generated_text"]
    y_pred.append(0 if text == "negative" else 1)

100%|██████████| 1066/1066 [00:47<00:00, 22.54it/s]


In [23]:
from sklearn.metrics import classification_report

def evaluate_performance(y_true,y_pred):
    performance = classification_report(y_true,y_pred,target_names=["Negative Review", "Positive Review"])
    print(performance)

In [24]:
evaluate_performance(data["test"]["label"], y_pred)

                 precision    recall  f1-score   support

Negative Review       0.80      0.90      0.85       533
Positive Review       0.89      0.77      0.83       533

       accuracy                           0.84      1066
      macro avg       0.84      0.84      0.84      1066
   weighted avg       0.84      0.84      0.84      1066

