# Requirements

In [None]:
! pip install transformers[torch] datasets evaluate tf-keras

In [83]:
from datasets import load_dataset

# Laboratory Exercise - Run Mode (8 points)

## Introduction
This laboratory assignment's primary objective is to fine-tune a pre-trained language model for binary classification on a dataset consisting of wine reviews. The dataset contains two attributes: **description** and **points**. The description is a brief text describing the wine and the points represent a quality metric ranging from 1 to 100. If some wine has at least 90 points it is considered **exceptional**. Your task involves predicting if some wine is **exceptional** based on its review.

## The Wine Reviews Dataset

Load the dataset using the `datasets` library.

In [84]:
data =  load_dataset('csv', data_files='wine-reviews.csv')

In [85]:
data

DatasetDict({
    train: Dataset({
        features: ['description', 'points'],
        num_rows: 10000
    })
})

## Target Extraction
Extract the target **exceptional** for each wine review. If some wine has at least 90 points it is considered **exceptional**.

In [86]:
def add_exceptional_label(example):
    example["exceptional"] = 1 if example["points"] >= 90 else 0
    return example

data = data.map(add_exceptional_label)

In [87]:
df = data['train'].to_pandas()
df.head()   

Unnamed: 0,description,points,exceptional
0,"Translucent in color, silky in the mouth, this...",85,0
1,"On the palate, this wine is rich and complex, ...",92,1
2,The producer blends 57% Chardonnay from the Ma...,92,1
3,"Pure Baga in all its glory, packed with dry an...",93,1
4,Think of Subsídio as a contribution rather tha...,89,0


## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.


In [88]:
data['train'].train_test_split(test_size=0.2)

DatasetDict({
    train: Dataset({
        features: ['description', 'points', 'exceptional'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['description', 'points', 'exceptional'],
        num_rows: 2000
    })
})

In [89]:
data = data['train'].train_test_split(test_size=0.2)

## Tokenization
Tokenize the texts using the `AutoTokenizer` class.

In [104]:
from transformers import AutoTokenizer

In [105]:
def tokenize_function(examples):
    return tokenizer(examples["description"], padding="max_length", truncation=True, max_length=512)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  
tokenized_datasets = data.map(tokenize_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## Fine-tuning a Pre-trained Language Model for Classification
Fine-tune a pre-trained language model for classification on the given dataset.

Define the model using the `AutoModelForSequenceClassification` class.

In [132]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Define the traning parameters using the `TrainingArguments` class.

In [133]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="test_trainer",
    learning_rate=5e-5
)

In [134]:
import numpy as np
import evaluate

metric = evaluate.load("f1")

In [135]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [136]:
from transformers import Trainer

training_args = TrainingArguments(
    output_dir="test_trainer",
    eval_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    metric_for_best_model="f1",
)

Define the training using the `Trainer` class.

In [137]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['description', 'points', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['description', 'points', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [138]:
if 'labels' not in tokenized_datasets['train'].features:
    tokenized_datasets = tokenized_datasets.map(
        lambda x: {'labels': int(x['exceptional'])},
        remove_columns=['exceptional']  # Remove the original field
    )

In [142]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=compute_metrics,
    processing_class=tokenizer,  
)

Fine-tune (train) the pre-trained lanugage model.

In [143]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [144]:
print(device)

cpu


Use the trained model to make predictions for the test set.

In [145]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['description', 'points', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['description', 'points', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [118]:
print(tokenized_datasets['train'][0].keys())

dict_keys(['description', 'points', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [146]:
trainer.train()

  0%|          | 0/3000 [00:00<?, ?it/s]

IndexError: index out of range in self

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [None]:
# Write your code here. Add as many boxes as you need.

# Laboratory Exercise - Bonus Task (+ 2 points)

Implement a simple machine learning pipeline to classify wine reviews as **exceptional** or not. Use TF-IDF vectorization to convert text into numerical features and train a logistic regression. Split the dataset into training and testing sets, fit the pipeline on the training data, and evaluate its performance using metrics such as precision, recall, and F1-score. Analyze the texts to find the most influential words or phrases associated with the **exceptional** wines. Use the coefficients from the logistic regression trained on TF-IDF features to identify the top positive and negative keywords for **exceptional** wines. Present these keywords in a simple table or visualization (e.g., bar chart).

In [None]:
# Write your code here. Add as many boxes as you need.