In [1]:
from transformers import DataCollatorWithPadding

In [2]:
from transformers import AutoTokenizer

In [3]:
from datasets import load_dataset
import torch

In [4]:
train_dataset = load_dataset("csv", data_files="books_train.csv")["train"]
test_dataset = load_dataset("csv", data_files="books_test.csv")["train"]

In [5]:
device = torch.device('cuda')
# torch.set_num_threads(16)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def preprocess_function(examples):
    inputs = tokenizer(examples["body"], truncation=True, return_tensors="pt", padding=True).to(device)
    return inputs
 
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [8]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [9]:
tokenized_train

Dataset({
    features: ['body', 'label', 'input_ids', 'attention_mask'],
    num_rows: 200
})

### Training the model 

In [10]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilgpt2", num_labels=2).to(device)
model.config.pad_token_id = model.config.eos_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
import numpy as np
import evaluate
 

In [12]:
def compute_metrics(eval_pred):
   load_accuracy = evaluate.load("accuracy")
   load_f1 = evaluate.load("f1")
  
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   accuracy = load_accuracy.compute(predictions=predictions, references=labels)
   f1 = load_f1.compute(predictions=predictions, references=labels)
   return {"accuracy": accuracy, "f1": f1}

In [13]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svâ€¦

In [18]:
from transformers import TrainingArguments, Trainer
 
repo_name = "finetuning-gpt2-Reddit-books"
 
training_args = TrainingArguments(
   output_dir=repo_name,
   learning_rate=2e-5,
   per_device_train_batch_size=1,
   per_device_eval_batch_size=1,
   num_train_epochs=2,
   weight_decay=0.01,
   save_strategy="epoch",
   push_to_hub=True,
   use_cpu=False
)
 
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tokenized_train,
   eval_dataset=tokenized_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [19]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=400, training_loss=1.5504039001464844, metrics={'train_runtime': 68.346, 'train_samples_per_second': 5.853, 'train_steps_per_second': 5.853, 'total_flos': 104522475110400.0, 'train_loss': 1.5504039001464844, 'epoch': 2.0})

In [20]:
trainer.evaluate()

{'eval_loss': 1.5367991924285889,
 'eval_accuracy': {'accuracy': 0.72},
 'eval_f1': {'f1': 0.8372093023255814},
 'eval_runtime': 5.3963,
 'eval_samples_per_second': 18.531,
 'eval_steps_per_second': 18.531,
 'epoch': 2.0}

In [16]:
trainer.evaluate()

{'eval_loss': 0.584126353263855,
 'eval_accuracy': {'accuracy': 0.72},
 'eval_f1': {'f1': 0.8372093023255814},
 'eval_runtime': 3.1473,
 'eval_samples_per_second': 31.774,
 'eval_steps_per_second': 2.224,
 'epoch': 2.0}