# Datasets

In [5]:
from datasets import load_dataset

dataset = load_dataset('rotten_tomatoes')

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

In [7]:
dataset.__class__

datasets.dataset_dict.DatasetDict

In [9]:
dataset['train'].__class__

datasets.arrow_dataset.Dataset

In [15]:
dataset['train'][0:4]

{'text': ['the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .',
  'the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .',
  'effective but too-tepid biopic',
  'if you sometimes like to go to the movies to have fun , wasabi is a good place to start .'],
 'label': [1, 1, 1, 1]}

In [17]:
dataset['train']['text'][0]

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'

In [19]:
dataset['train'][0]['text']

'the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .'

In [21]:
from time import time

start = time()
dataset['train'][0]['text']
end = time()
print(f">>> elapsed time: {end-start}s")

start = time()
dataset['train']['text'][0]
end = time()
print(f">>> elapsed time: {end-start}s")

>>> elapsed time: 0.0009989738464355469s
>>> elapsed time: 0.012161970138549805s


In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [23]:
tokenizer(dataset['train'][0]['text'])

{'input_ids': [101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301, 1005, 1055, 2047, 1000, 16608, 1000, 1998, 2008, 2002, 1005, 1055, 2183, 2000, 2191, 1037, 17624, 2130, 3618, 2084, 7779, 29058, 8625, 13327, 1010, 3744, 1011, 18856, 19513, 3158, 5477, 4168, 2030, 7112, 16562, 2140, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [29]:
def tokenization(x):
    return tokenizer(x['text'])

# dataset = dataset.map(tokenization, batch_size=1000)
dataset = dataset.map(tokenization, batched=True)
dataset

Map: 100%|███████████████████████████████████████████████████████████████| 8530/8530 [00:00<00:00, 14136.91 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 1066/1066 [00:00<00:00, 14786.26 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 1066/1066 [00:00<00:00, 17250.52 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8530
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1066
    })
})

In [30]:
from datasets import Dataset, DatasetDict

ds = Dataset.from_dict({
    "pokemon": ['picachu', 'tortuga'],
    "type": ['water', 'grass']
})
ds

Dataset({
    features: ['pokemon', 'type'],
    num_rows: 2
})

In [31]:
ds.to_pandas()

Unnamed: 0,pokemon,type
0,picachu,water
1,tortuga,grass


# Fine tuning (ejemplo)

In [32]:
import pandas as pd

df = pd.read_csv("https://raw.githubusercontent.com/eduardofc/data/main/amazon_sports.csv")
df['review_body'] = df['review_body'].str.replace("[^a-zA-ZñÑáéíóú .,]", "", regex=True)
df['review_body'] = df['review_body'].str.lower()
df.head()

Unnamed: 0,stars,review_body,review_title,product_category
0,1,nunca llego el pedido y el vendedor pasa de to...,No llego nunca,sports
1,1,"no sé como es, porque debería haber llegado ay...",Todavía no ha llegado,sports
2,1,"guantes cómodos, no lo niego, pero de mala cal...",Guantes de baja calidad,sports
3,1,hasta hoy no he visto el producto. el pedido h...,Muy Mala experiencia,sports
4,1,"no puedo valorarla porque, después de casi una...",Paquete perdido?,sports


In [33]:
df = df[df.stars != 2]
df = df[df.stars != 3]
df = df[df.stars != 4]

df['good_product'] = (df.stars > 3).astype(int)

df.groupby('good_product').size()

good_product
0    2438
1    2512
dtype: int64

In [60]:
ds = Dataset.from_pandas(df)
ds = ds.remove_columns(['stars', 'review_title', 'product_category', '__index_level_0__'])
ds = ds.rename_column("good_product", "labels")
ds = ds.rename_column("review_body", "text")
ds = ds.train_test_split(test_size=.2, seed=99)
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 3960
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 990
    })
})

In [61]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_id = 'distilbert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(model_id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [68]:
def tokenization(x):
    return tokenizer(x['text'], truncation=True)

tokenized_ds = ds.map(tokenization, batched=True)
tokenized_ds

Map: 100%|███████████████████████████████████████████████████████████████| 3960/3960 [00:00<00:00, 19367.67 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████| 990/990 [00:00<00:00, 20449.24 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3960
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 990
    })
})

# Training

In [69]:
from transformers import TrainingArguments

training_args = TrainingArguments(model_id)

In [70]:
# training_args.weight_decay = 0.1

In [71]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
)

In [76]:
from transformers import Trainer
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_ds['train'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [77]:
%%time

trainer.train()

Step,Training Loss
500,0.1363
1000,0.0739


CPU times: total: 2min 40s
Wall time: 2min 40s


TrainOutput(global_step=1485, training_loss=0.08127385662862348, metrics={'train_runtime': 159.8377, 'train_samples_per_second': 74.325, 'train_steps_per_second': 9.291, 'total_flos': 390056464751904.0, 'train_loss': 0.08127385662862348, 'epoch': 3.0})

# Evaluación

In [83]:
preds = trainer.predict(tokenized_ds['test'])

In [84]:
import numpy as np

preds = np.argmax(preds.predictions, 1)
preds[:5]

array([1, 1, 1, 1, 0], dtype=int64)

In [85]:
from sklearn.metrics import accuracy_score, f1_score

accuracy_score(y_true=tokenized_ds['test']['labels'], y_pred=preds)

0.9262626262626262

In [86]:
f1_score(y_true=tokenized_ds['test']['labels'], y_pred=preds)

0.9257375381485249