In [21]:
from transformers import DataCollatorWithPadding

In [22]:
from transformers import AutoTokenizer

In [23]:
from datasets import load_dataset

In [24]:
ds = load_dataset("csv", data_files="books_train.csv")

In [25]:
ds = ds["train"].train_test_split(test_size=0.333)

In [26]:
train_dataset = ds["train"]
test_dataset = ds["test"]

In [27]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [28]:
def preprocess_function(examples):
   return tokenizer(examples["body"], truncation=True)
 
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [29]:
test_dataset

Dataset({
    features: ['body', 'label'],
    num_rows: 100
})

In [30]:
tokenizer(test_dataset[10]['body'], truncation=True)

{'input_ids': [101, 1046, 1012, 1040, 1012, 16183, 9912, 1027, 2417, 5210, 1029, 2339, 2003, 2009, 2061, 2691, 5921, 2338, 2111, 2000, 23084, 2009, 2004, 1037, 2417, 5210, 2065, 2619, 7777, 16183, 9912, 2015, 2808, 1029, 1045, 3984, 2116, 2111, 7868, 2008, 2065, 2619, 7777, 13795, 1999, 1996, 20926, 1010, 2008, 2711, 2064, 6709, 2007, 9988, 6187, 21007, 12891, 1029, 1999, 2755, 2045, 2024, 2061, 2116, 2060, 2808, 1999, 2029, 1996, 21989, 2024, 2061, 2172, 4788, 1010, 2008, 4995, 1005, 1056, 2464, 2004, 2417, 9245, 1010, 2061, 1045, 2428, 2123, 1005, 1056, 2131, 2009, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [31]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Training the model 

In [32]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
import numpy as np
import evaluate
 

In [34]:
def compute_metrics(eval_pred):
   load_accuracy = evaluate.load("accuracy")
   load_f1 = evaluate.load("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)
   f1 = load_f1.compute(predictions=predictions, references=labels)
   return {"accuracy": accuracy, "f1": f1}

In [36]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [37]:
tokenized_train

Dataset({
    features: ['body', 'label', 'input_ids', 'attention_mask'],
    num_rows: 200
})

In [38]:
tokenized_test

Dataset({
    features: ['body', 'label', 'input_ids', 'attention_mask'],
    num_rows: 100
})

In [39]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-sentiment-model-3000-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [40]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=26, training_loss=0.6131710639366736, metrics={'train_runtime': 66.3238, 'train_samples_per_second': 6.031, 'train_steps_per_second': 0.392, 'total_flos': 52800677183040.0, 'train_loss': 0.6131710639366736, 'epoch': 2.0})

In [41]:
trainer.evaluate()

{'eval_loss': 0.6117655038833618,
 'eval_accuracy': {'accuracy': 0.69},
 'eval_f1': {'f1': 0.8165680473372781},
 'eval_runtime': 6.0497,
 'eval_samples_per_second': 16.53,
 'eval_steps_per_second': 1.157,
 'epoch': 2.0}

In [42]:
trainer.evaluate()

{'eval_loss': 0.6117655038833618,
 'eval_accuracy': {'accuracy': 0.69},
 'eval_f1': {'f1': 0.8165680473372781},
 'eval_runtime': 6.1937,
 'eval_samples_per_second': 16.146,
 'eval_steps_per_second': 1.13,
 'epoch': 2.0}