In [1]:
from transformers import DataCollatorWithPadding

In [2]:
from transformers import AutoTokenizer

In [3]:
from datasets import load_dataset

In [4]:
ds = load_dataset("csv", data_files="books_train.csv")

In [5]:
ds = ds["train"].train_test_split(test_size=0.333)

In [6]:
train_dataset = ds["train"]
test_dataset = ds["test"]

In [7]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [8]:
def preprocess_function(examples):
   return tokenizer(examples["body"], truncation=True)
 
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
test_dataset

Dataset({
    features: ['body', 'label'],
    num_rows: 100
})

In [10]:
tokenizer(test_dataset[10]['body'], truncation=True)

{'input_ids': [101, 1045, 2018, 2000, 3191, 1996, 2331, 1997, 7332, 6335, 28008, 2818, 2005, 2028, 1997, 2026, 4280, 2023, 13609, 1012, 1045, 3517, 2009, 2000, 2022, 1037, 8011, 2021, 2009, 11476, 2135, 2718, 2033, 2524, 2009, 1005, 1055, 1037, 2200, 2460, 2338, 2012, 2055, 2753, 2070, 5976, 5530, 1010, 2021, 6195, 1045, 2001, 26536, 18483, 2009, 1999, 1996, 2168, 13609, 2004, 10250, 2278, 2462, 1010, 16012, 5403, 2213, 1010, 1998, 16545, 2078, 3523, 1010, 2009, 2790, 2066, 2498, 2021, 1037, 2051, 5949, 1006, 2026, 6346, 2005, 2635, 1037, 4695, 2465, 2076, 1037, 9742, 13609, 1007, 1010, 2021, 1045, 2941, 3092, 2039, 8295, 2009, 2061, 2172, 1045, 3191, 1996, 2972, 2518, 1999, 2028, 3564, 2750, 2009, 2108, 3714, 2039, 1999, 1996, 25353, 4571, 8286, 2058, 1017, 3134, 1012, 7332, 2001, 2521, 2013, 2108, 2151, 4066, 1997, 13026, 2839, 1010, 1998, 2130, 2012, 1996, 2203, 1997, 1996, 2338, 1045, 2179, 2870, 2025, 3110, 2919, 2005, 2032, 1010, 2021, 1996, 4784, 1998, 8474, 2716, 2039, 1999, 19

In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Training the model 

In [12]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import numpy as np
import evaluate
 

In [14]:
def compute_metrics(eval_pred):
   load_accuracy = evaluate.load("accuracy")
   load_f1 = evaluate.load("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)
   f1 = load_f1.compute(predictions=predictions, references=labels)
   return {"accuracy": accuracy, "f1": f1}

In [17]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
tokenized_train

Dataset({
    features: ['body', 'label', 'input_ids', 'attention_mask'],
    num_rows: 200
})

In [19]:
tokenized_test

Dataset({
    features: ['body', 'label', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [20]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-sentiment-model-3000-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  return torch._C._cuda_getDeviceCount() > 0
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disa

In [21]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=26, training_loss=0.6071650798504169, metrics={'train_runtime': 212.0856, 'train_samples_per_second': 1.886, 'train_steps_per_second': 0.123, 'total_flos': 51527748274080.0, 'train_loss': 0.6071650798504169, 'epoch': 2.0})

In [22]:
trainer.evaluate()

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

{'eval_loss': 0.5749530792236328,
 'eval_accuracy': {'accuracy': 0.73},
 'eval_f1': {'f1': 0.8439306358381503},
 'eval_runtime': 13.2702,
 'eval_samples_per_second': 7.536,
 'eval_steps_per_second': 0.527,
 'epoch': 2.0}

In [21]:
trainer.evaluate()

{'eval_loss': 0.3684280514717102,
 'eval_accuracy': {'accuracy': 0.8766666666666667},
 'eval_f1': {'f1': 0.8794788273615635},
 'eval_runtime': 5.8077,
 'eval_samples_per_second': 51.655,
 'eval_steps_per_second': 3.271,
 'epoch': 2.0}