# **Fine Tuning for Sequence Classification**

1. Load the Custome Data
2. Load the Pre-Trained Model and Tokenizer
3. Preprocess the data

### **Step 1: Load the Custom Data**

In [1]:
from datasets import load_dataset

# Use load_dataset() to load a csv file 
dataset = load_dataset('csv', data_files="./data/imdbs.csv", split="train")
print()
print(type(dataset))




<class 'datasets.arrow_dataset.Dataset'>


In [2]:
# Split the dataset into train and test

dataset = dataset.train_test_split(test_size=0.3)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 70
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 30
    })
})

In [3]:
# Separate train_set and test_set

train_set = dataset["train"]
test_set = dataset["test"]

In [4]:
train_set

Dataset({
    features: ['text', 'label'],
    num_rows: 70
})

### **Step 2: Load the Pre-Trained Model and Tokenizer**

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### **Step 3: Preprocess the data**

In [6]:
# !pip install dill==0.3.4

In [7]:
def preprocess(data):
    return tokenizer(data["text"], padding=True, truncation=True)

In [8]:
# Dataset.map() method applies a pre-processing function to each row
# batch=True allows to preprocess multiple rows at a time in parallel

train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set), load_from_cache_file=False)

test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set), load_from_cache_file=False)

Map:   0%|          | 0/70 [00:00<?, ? examples/s]

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

In [9]:
train_set

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 70
})

In [10]:
# Setting the pytorch format for input_ids, attention_mask and label

train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

In [11]:
train_set

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 70
})

## **Train the Model**

In [12]:
# Define the batch size and number of epochs to fine tune the model

batch_size = 8
epochs = 2

In [13]:
# Hyperparameters

warmup_steps = 500
weight_decay = 0.01

In [14]:
# !pip install accelerate

In [15]:
# Define Training Arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./.results',
    eval_strategy='epoch',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir='./.logs',
)

In [16]:
# Define Trainer
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set
)

In [17]:
# Start the training

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.691023
2,No log,0.692349


TrainOutput(global_step=18, training_loss=0.6934383180406358, metrics={'train_runtime': 11.9003, 'train_samples_per_second': 11.764, 'train_steps_per_second': 1.513, 'total_flos': 36835547750400.0, 'train_loss': 0.6934383180406358, 'epoch': 2.0})

In [18]:
trainer.evaluate()

{'eval_loss': 0.6923487782478333,
 'eval_runtime': 0.5986,
 'eval_samples_per_second': 50.115,
 'eval_steps_per_second': 6.682,
 'epoch': 2.0}