In [1]:
from transformers import DataCollatorWithPadding

In [2]:
from transformers import AutoTokenizer

In [3]:
from datasets import load_dataset

dataset_all = load_dataset("csv", data_files="books_train.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [5]:
dataset = dataset_all['train'].train_test_split(test_size = 0.1)

In [6]:
train = dataset['train']
test = dataset["test"]
print(test)

Dataset({
    features: ['title', 'score', 'id', 'url', 'comms_num', 'created', 'body', 'timestamp', 'recomendation', 'label'],
    num_rows: 21
})


In [7]:
def preprocess_function(examples):
   return tokenizer(examples["body"], truncation=True)
tokenized_train = train.map(preprocess_function, batched=True)
tokenized_test = test.map(preprocess_function, batched=True)

Map:   0%|          | 0/180 [00:00<?, ? examples/s]

Map:   0%|          | 0/21 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Training the model 

In [9]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
import numpy as np
import evaluate
 

In [11]:
def compute_metrics(eval_pred):
   load_accuracy = evaluate.load("accuracy")
   load_f1 = evaluate.load("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)
   f1 = load_f1.compute(predictions=predictions, references=labels)
   return {"accuracy": accuracy, "f1": f1}

In [12]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [13]:
tokenized_train

Dataset({
    features: ['title', 'score', 'id', 'url', 'comms_num', 'created', 'body', 'timestamp', 'recomendation', 'label', 'input_ids', 'attention_mask'],
    num_rows: 180
})

In [14]:
tokenized_test

Dataset({
    features: ['title', 'score', 'id', 'url', 'comms_num', 'created', 'body', 'timestamp', 'recomendation', 'label', 'input_ids', 'attention_mask'],
    num_rows: 21
})

In [16]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-sentiment-model-200-samples"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [17]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=24, training_loss=0.6177810827891032, metrics={'train_runtime': 58.1917, 'train_samples_per_second': 6.186, 'train_steps_per_second': 0.412, 'total_flos': 46965902232864.0, 'train_loss': 0.6177810827891032, 'epoch': 2.0})

In [18]:
trainer.evaluate()

{'eval_loss': 0.6431981921195984,
 'eval_accuracy': {'accuracy': 0.6666666666666666},
 'eval_f1': {'f1': 0.8},
 'eval_runtime': 2.3241,
 'eval_samples_per_second': 9.036,
 'eval_steps_per_second': 0.861,
 'epoch': 2.0}