# **Fine Tuning for Sequence Classification**

In [1]:
# ! pip install nlp

In [2]:
from nlp import load_dataset
import torch
import numpy as np



In [3]:
dataset = load_dataset('csv', data_files="./data/imdbs.csv", split="train")
print()
print(type(dataset))

Using custom data configuration default



<class 'nlp.arrow_dataset.Dataset'>


In [4]:
dataset = dataset.train_test_split(test_size=0.3)

dataset

{'train': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 70),
 'test': Dataset(features: {'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None)}, num_rows: 30)}

In [5]:
train_set = dataset["train"]
test_set = dataset["test"]

## **Load BERT Model and Tokenizer**

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Preprocessing the Data**

In [7]:
# !pip install dill==0.3.4

In [8]:
def preprocess(data):
    return tokenizer(data["text"], padding=True, truncation=True)

In [9]:
train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set), load_from_cache_file=False)

test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set), load_from_cache_file=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

## **Train the Model**

In [11]:
batch_size = 8
epochs = 2

In [12]:
warmup_steps = 500
weight_decay = 0.01

In [13]:
# !pip install accelerate

In [14]:
# Define Training Arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir='./logs',
)

In [15]:
# Define Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set
)

In [16]:
# Start the training

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.695583
2,No log,0.68192


TrainOutput(global_step=18, training_loss=0.6928195423550076, metrics={'train_runtime': 18.4656, 'train_samples_per_second': 7.582, 'train_steps_per_second': 0.975, 'total_flos': 36835547750400.0, 'train_loss': 0.6928195423550076, 'epoch': 2.0})

In [17]:
trainer.evaluate()

{'eval_loss': 0.6819199919700623,
 'eval_runtime': 0.7959,
 'eval_samples_per_second': 37.695,
 'eval_steps_per_second': 5.026,
 'epoch': 2.0}